In [2]:
from extract_features import process_dataset, load_and_process_sample
from visualization import signal_viewer
from imu_pipeline import IMUPipeline
from pathlib import Path
import pandas as pd
import joblib
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

In [None]:
# Launch the signal viewer over the data/raw/train directory, using data/train.csv as the labels file.
signal_viewer(labels_csv=Path('data/train.csv'), data_dir=Path('data/raw/train'))

In [None]:
# Load the training table and print its (rows, columns) shape.
train_df = pd.read_csv('data/train.csv')
print("Size of training set:", train_df.shape)

# Show the first rows once, via rich display only (no plain-text print of the
# same rows, which would duplicate the table in the cell output).
print("Few first rows of training set:")
display(train_df.head())

# Summary statistics as produced by DataFrame.describe().
print("\nStatistics by numerical features:")
display(train_df.describe())

# Number of rows per value of the 'label' column.
print("\nDistribution of labels:")
display(train_df['label'].value_counts())

In [None]:
# Generate features for inference set
print("# Generate features for inference set...")
inference_df = process_dataset('inference')
print("Done")

# Load the test table for comparison against the freshly generated features.
test_df = pd.read_csv('data/test.csv')
print("\nSize of test dataset:", test_df.shape)
print("Size of inference dataset:", inference_df.shape)

# Side-by-side describe() summaries of both tables.
print("\nComparison of feature distribution:")
print("\nTest dataset:")
display(test_df.describe())
print("\nInference dataset:")
display(inference_df.describe())

# Per-column means of the numeric columns; each mean vector is computed once
# and reused for both its own column and the difference.
print("\nComparison of features avg values:")
test_mean = test_df.select_dtypes(include=[np.number]).mean()
inference_mean = inference_df.select_dtypes(include=[np.number]).mean()
mean_comparison = pd.DataFrame({
    'test_mean': test_mean,
    'inference_mean': inference_mean,
    # Columns present in only one table come out as NaN after index alignment.
    'difference': test_mean - inference_mean,
})
display(mean_comparison)

# Bar chart of the per-feature mean difference (test minus inference).
plt.figure(figsize=(15, 6))
plt.bar(mean_comparison.index, mean_comparison['difference'])
plt.xticks(rotation=90)
plt.ylabel('test_mean - inference_mean')
plt.title('Difference between test and inference features')
plt.tight_layout()
plt.show()

# The label distribution is only shown when the test table carries labels.
if 'label' in test_df.columns:
    print("\nDistribution of labels in test dataset:")
    display(test_df['label'].value_counts())

In [None]:
# List of key features to visualize
key_features = ['x_energy', 'y_energy', 'z_energy', 'sudden_change_score', 'x_fft_max', 'y_fft_max', 'z_fft_max']

# Overlaid histograms of each key feature for the test and inference tables.
for feature in key_features:
    # Same guard style as the categorical sections below: skip a column that
    # is absent from either table instead of failing with a KeyError.
    if feature not in test_df.columns or feature not in inference_df.columns:
        print(f"Skipping {feature}: column missing from test or inference set")
        continue
    plt.figure(figsize=(8, 4))
    plt.hist(test_df[feature], bins=30, alpha=0.5, label='Test')
    plt.hist(inference_df[feature], bins=30, alpha=0.5, label='Inference')
    plt.title(f'Distribution of {feature}')
    plt.legend()
    plt.show()

# Categorical columns of the inference table to summarize:
# (column name, heading used in the printed line, bar-chart title).
categorical_views = [
    ('calibration_status', 'Calibration status', 'Calibration Status Distribution'),
    ('weather', 'Weather', 'Weather Distribution'),
    ('device_model', 'Device model', 'Device Model Distribution'),
]

for column, heading, chart_title in categorical_views:
    # Each column is optional; absent ones are skipped silently.
    if column not in inference_df.columns:
        continue
    # Count once and reuse for both the printed table and the bar chart.
    counts = inference_df[column].value_counts()
    print(f"{heading} in inference set:")
    print(counts)
    counts.plot(kind='bar', title=chart_title)
    plt.show()

In [None]:
# Restore the serialized pipeline from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
model_path = "models/imu_pipeline.pkl"
model = joblib.load(model_path)

# Read the label file for the inference samples (stored under data/manual_annotation).
labels_path = 'data/manual_annotation/inference_labels.csv'
inference_labels = pd.read_csv(labels_path)

# Print both shapes so a row-count mismatch is visible at a glance.
print("\nShape of inference dataset:", inference_df.shape)
print("Shape of inference labels:", inference_labels.shape)

In [None]:
# Predictions: the pipeline receives the full inference feature frame.
predictions = model.predict(inference_df)
probabilities = model.predict_proba(inference_df)

# One row per predicted sample. .to_numpy() drops the source index so the
# columns are combined strictly by row position of inference_df.
results_df = pd.DataFrame({
    'sample_id': inference_df['sample_id'].to_numpy(),
    'predicted': predictions,
})

# Attach the ground-truth labels. Joining by sample_id makes the result
# independent of the row order / index of the label file; relying on implicit
# index alignment silently mis-pairs rows when the two tables are ordered differently.
if 'sample_id' in inference_labels.columns:
    if not inference_labels['sample_id'].is_unique:
        raise ValueError("inference_labels contains duplicate sample_id values")
    label_by_id = inference_labels.set_index('sample_id')['label']
    results_df['actual'] = results_df['sample_id'].map(label_by_id)
else:
    # No key to join on: fall back to row position, but only when the row
    # counts agree, so a mismatch cannot go unnoticed.
    if len(inference_labels) != len(results_df):
        raise ValueError(
            f"Cannot pair labels with predictions by position: "
            f"{len(inference_labels)} labels vs {len(results_df)} predictions"
        )
    results_df['actual'] = inference_labels['label'].to_numpy()

# Samples without a ground-truth label cannot be scored; drop them explicitly
# instead of counting them as misclassifications.
unlabeled = results_df['actual'].isna()
if unlabeled.all() and len(results_df) > 0:
    raise ValueError(
        "No prediction could be matched to a label; check that sample_id "
        "values (and their dtypes) agree between inference_df and inference_labels"
    )
if unlabeled.any():
    print(f"Dropping {int(unlabeled.sum())} samples without a ground-truth label")
    results_df = results_df.loc[~unlabeled].reset_index(drop=True)

# Accuracy calculation: share of rows where prediction equals the label.
accuracy = (results_df['predicted'] == results_df['actual']).mean()
print(f"Accuracy in inference dataset: {accuracy:.2%}")

In [None]:
# Error matrix
# Fix the class order explicitly (sorted union of true and predicted values)
# so the same order can be used for the matrix rows/columns and the tick labels.
class_labels = sorted(set(results_df["actual"]) | set(results_df["predicted"]))
cm = confusion_matrix(results_df["actual"], results_df["predicted"], labels=class_labels)
plt.figure(figsize=(8,6))
# Tick labels carry the class names; without them the axes only show positional indices.
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues", cbar=False,
    xticklabels=class_labels, yticklabels=class_labels,
)
plt.title("Error matrix (inference dataset)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Classification report (per-class precision / recall / F1 as printed by scikit-learn)
print("\nClassification report (inference dataset):")
print(classification_report(results_df["actual"], results_df["predicted"]))


# Reference performance on the test table, only when it carries labels.
if 'label' in test_df.columns:
    # NOTE(review): test_df is passed with its 'label' column still present;
    # presumably the pipeline selects its own feature columns — confirm.
    test_predictions = model.predict(test_df)
    test_accuracy = (test_predictions == test_df['label']).mean()
    print(f"Accuracy in test dataset: {test_accuracy:.2%}")
    print("\nClassification report (test dataset):")
    print(classification_report(test_df['label'], test_predictions))

# ❓ Questions to Reflect On

---

### 1. What do you observe when comparing the model's predictions on the new data to its known performance?

I observe a significant drop in model performance:
- **On test set:** accuracy 94.50%
- **On inference set:** accuracy 51.37%

The degradation is most pronounced for the "normal" class:
- **On test:** precision 0.90, recall 1.00
- **On inference:** precision 0.54, recall 0.20

This indicates that the model generalizes poorly to new data, especially in normal driving scenarios.

---

### 2. Is there anything in the data that might explain differences in behavior?

Yes, I discovered significant differences in the data:
1. The mean values of signal energy features (`x_energy`, `y_energy`, `z_energy`) are much higher in the inference set than in the test set. This suggests that the new data contains more intense or dynamic movements, or that the sensor characteristics or preprocessing have changed.
2. Features related to sudden changes (`sudden_change_score`) and frequency characteristics (the per-axis `fft_max` and `fft_mean` features, e.g. `x_fft_max`) also differ.
3. The distribution of feature values in the inference set substantially differs from the test set.

These differences explain why the model trained on one data distribution performs poorly on another.

---

### 3. Can you identify patterns or trends related to when the model succeeds or fails?

Yes, I can identify the following patterns:

**Model successes:**
- Better at detecting collisions than normal situations.
- Has high recall (0.83) for the collision class.

**Model failures:**
- Poor at detecting normal situations (recall 0.20).
- Has low precision for both classes (0.51 and 0.54).
- Frequently confuses normal situations with collisions.

---

### 4. Are there signals or features that seem to affect the model's reliability?

Yes, I identified key features affecting model reliability:

1. **Energy features:**
   - `x_energy`, `y_energy`, `z_energy`
   - These features show the greatest difference between datasets.
2. **Sudden change features:**
   - `sudden_change_score`
   - `max_delta_mag`
   - These features also differ significantly in new data.
3. **Frequency characteristics:**
   - `fft_max`
   - `fft_mean`
   - Show significant differences between datasets.

---

### 5. What could be done in the short term to handle the current situation?

**Short-term solutions:**
1. **Model recalibration:**
   - Retrain the model on a labeled portion of the inference dataset to better adapt to the new data distribution, keeping the remaining samples held out so the improvement can still be measured.
   - Adjust decision thresholds for key features if retraining is not possible.
2. **Feature adaptation:**
   - Normalize energy-related features (such as `x_energy`, `y_energy`, `z_energy`) to reduce the impact of scale differences between datasets.
   - Consider using relative or standardized feature values instead of absolute ones.
3. **Regularization:**
   - Apply L1 or L2 regularization to improve the model's robustness to feature distribution shifts.
   - Tune the regularization parameter to achieve the best balance between bias and variance.

---

### 6. What are potential long-term steps to improve model performance in similar scenarios?

**Long-term solutions:**
1. **Enhance data collection:**
   - Gather a more diverse dataset that better represents real-world scenarios.
   - Ensure data is collected under a variety of conditions (e.g., different weather, road types, and vehicle speeds).
   - Increase the number of samples for each class to improve class balance and model generalization.
2. **Improve feature engineering:**
   - Develop more robust and informative features that are less sensitive to changes in data distribution.
   - Utilize relative or standardized metrics where appropriate.
   - Incorporate contextual features that capture additional information about the driving environment or situation.
3. **Advance model development:**
   - Explore ensemble methods to increase predictive stability and accuracy.
   - Investigate adaptive learning techniques to help the model adjust to new data over time.
   - Implement a continuous learning pipeline to regularly update the model as new labeled data becomes available.

---

### 7. What would you want to explore further if given more time or data?

**Additional research:**
1. **Data analysis:**
   - Study the impact of metadata (weather, road type, speed).
   - Analyze temporal patterns.
   - Investigate feature correlations.
2. **Model improvement:**
   - Try other algorithms.
   - Research ensemble methods.
   - Test deep learning.
3. **Validation:**
   - Collect more test data.
   - Conduct A/B testing of different approaches.
   - Evaluate performance in real-world conditions.

---

### 8. What assumptions did the model rely on during training — and are they still valid?

1. **Data distribution stability:**  
   The model assumed that the feature distributions in the training and future data would be similar. In reality, there is a significant distribution shift, which leads to degraded performance.
2. **Sensor and processing consistency:**  
   It was assumed that devices, calibration, and data processing would remain unchanged. In practice, differences in hardware, firmware, or collection conditions can affect the data.
3. **Labeling consistency:**  
   The model relied on consistent class labeling rules. Any changes in annotation guidelines could negatively impact results.
4. **Feature relevance:**  
   It was assumed that the selected features would remain equally informative for new data. However, due to distribution shifts, their importance may have changed.

**Conclusion:**  
Most of these assumptions did not fully hold, which explains the drop in model performance on new data. The model needs to be adapted, and data collection and processing procedures should be improved.