In [None]:
# ==============================================================================
# TCNonOwnData_v2.ipynb
#
# Unified Training and Android Asset Export for HAR
#
# This notebook provides a complete, end-to-end pipeline:
# 1. Loads and preprocesses sensor data.
# 2. Trains a high-accuracy TCN Keras model.
# 3. Evaluates the Keras model to establish a performance benchmark.
# 4. Converts the Keras model to a high-quality, quantized TFLite model
#    suitable for Android deployment.
# 5. Exports the necessary scaler and label metadata as JSON files.
# 6. Verifies the TFLite model's accuracy against the Keras benchmark to
#    confirm that the conversion did not introduce a significant accuracy drop.
#
# Author: Your Name
# Date: [Current Date]
# ==============================================================================

# --- Block 1: Setup and Initial Configuration ---
print("--- Block 1: Setup and Initial Configuration ---")

# Install necessary packages quietly
!pip install joblib -q

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import time
import json
from pathlib import Path

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Add
from tensorflow.keras.layers import Conv1D, SpatialDropout1D, GlobalAveragePooling1D, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.utils import class_weight

# --- Mount Google Drive ---
# Every input and output path configured below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# --- Main Configuration ---
# Input CSV files, both located in the project's data folder on Drive.
drive_base_path = Path('/content/drive/MyDrive/Colab_HAR_Project/data')
INPUT_CSV_1 = drive_base_path.joinpath('resampled_normalized_phone_data.csv')
INPUT_CSV_2 = drive_base_path.joinpath('combined_collected_data.csv')

# Activity codes kept from each input file. The merged list is sorted so the
# file prefix built from it does not depend on the order they are listed in.
ACTIVITIES_FROM_FILE1 = ['B', 'D', 'E']
ACTIVITIES_FROM_FILE2 = ['A', 'C']
ALL_ACTIVITIES_TO_KEEP = sorted([*ACTIVITIES_FROM_FILE1, *ACTIVITIES_FROM_FILE2])

# Dedicated results folder for this run; every exported file name starts
# with FILE_PREFIX.
output_drive_path = Path('/content/drive/MyDrive/Colab_HAR_Project/results')
OUTPUT_DIR = output_drive_path.joinpath('TCN_Results_v2_with_Assets')
FILE_PREFIX = f"tcn_v2_{''.join(ALL_ACTIVITIES_TO_KEEP)}_"

# Create the results folder (and any missing parents); an existing folder is reused.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"All outputs will be saved to: {OUTPUT_DIR}")


# --- Windowing, Model, Training Parameters ---
# Rows (timesteps) per window; also the length of the model's input sequence.
WINDOW_SIZE = 60
# Step, in rows, between the starts of consecutive windows within a group.
STRIDE = 15
# Kernel size of the two main Conv1D layers inside each residual block.
KERNEL_SIZE = 7
# Number of filters of every Conv1D layer inside the residual blocks.
NUM_FILTERS = 64
# Number of stacked residual blocks; each one gets its own dilation rate.
NUM_TCN_BLOCKS = 5
# One dilation rate per block, doubling each time: 1, 2, 4, 8, 16.
DILATION_RATES = [2**i for i in range(NUM_TCN_BLOCKS)]
# SpatialDropout1D rate applied after each convolution stage.
SPATIAL_DROPOUT_RATE = 0.15
# Dropout rate applied after global average pooling, before the softmax layer.
FINAL_DROPOUT_RATE = 0.3
# NOTE(review): L2_REG is not referenced by any code visible in this cell —
# confirm whether L2 kernel regularization was meant to be applied.
L2_REG = 1e-4
# Mini-batch size passed to model.fit.
BATCH_SIZE = 64
# Maximum number of epochs; the EarlyStopping callback can end training sooner.
EPOCHS = 100
# Fraction of the training windows handed to model.fit as validation_split.
VALIDATION_SPLIT = 0.2
# Patience (in epochs) of the EarlyStopping callback, which monitors val_loss.
EARLY_STOPPING_PATIENCE = 15

# --- GPU Check ---
# Report whether TensorFlow sees a GPU and, if it does, ask it to allocate
# GPU memory on demand.
print("\n--- GPU Check ---")
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print("!!! No GPU found. Training will use CPU. !!!")
else:
    print(f"Found {len(gpu_devices)} GPU(s). Enabling memory growth.")
    try:
        for gpu in gpu_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Reported rather than raised so the notebook can continue.
        print(f"Could not set memory growth: {err}")
print("-" * 25)

# ==============================================================================
# --- Block 2: Data Loading and Preprocessing ---
# ==============================================================================
print("\n--- Block 2: Data Loading and Preprocessing ---")

def create_subject_activity_windows(df, window_size, stride):
    """Cut every (subject, activity) recording into fixed-length windows.

    Rows are grouped by ``subject`` and ``activity``; each group is ordered by
    ``timestamp`` and sliced into windows of ``window_size`` rows whose starts
    are ``stride`` rows apart. Groups with fewer than ``window_size`` rows
    produce no windows.

    Args:
        df: DataFrame with columns ``subject``, ``activity``, ``timestamp``
            and the six sensor columns ``x_accel``, ``y_accel``, ``z_accel``,
            ``x_gyro``, ``y_gyro``, ``z_gyro``.
        window_size: Number of rows in each window.
        stride: Number of rows between the starts of consecutive windows.

    Returns:
        A tuple ``(windows, labels, subject_ids)`` of NumPy arrays: windows has
        shape ``(n_windows, window_size, 6)``; labels and subject_ids hold the
        activity and subject of the group each window was cut from.

    Raises:
        ValueError: If no group is long enough to yield a single window.
    """
    sensor_columns = ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']
    groups = df.groupby(['subject', 'activity'])
    print(f"Processing {len(groups)} subject-activity groups for windowing...")

    windows, labels, subject_ids = [], [], []
    for _, group_df in groups:
        samples = group_df.sort_values('timestamp')[sensor_columns].values
        row_count = len(samples)
        if row_count < window_size:
            continue

        # Both values are constant within a group, so look them up once.
        activity = group_df["activity"].iloc[0]
        subject = group_df["subject"].iloc[0]

        starts = range(0, row_count - window_size + 1, stride)
        windows.extend(samples[begin:begin + window_size] for begin in starts)
        labels.extend([activity] * len(starts))
        subject_ids.extend([subject] * len(starts))

    if len(windows) == 0:
        raise ValueError("No windows created.")
    return np.array(windows), np.array(labels), np.array(subject_ids)

# --- Load and combine data ---
# Each CSV contributes only the activity codes assigned to it in the
# configuration; rows carrying any other activity value are dropped before
# the two frames are stacked.
df1 = pd.read_csv(INPUT_CSV_1)
df2 = pd.read_csv(INPUT_CSV_2)
df1_filtered = df1[df1['activity'].isin(ACTIVITIES_FROM_FILE1)]
df2_filtered = df2[df2['activity'].isin(ACTIVITIES_FROM_FILE2)]
combined_df = pd.concat([df1_filtered, df2_filtered], ignore_index=True)
print(f"Combined dataset has {len(combined_df)} rows.")

# --- Create windows ---
# X holds the raw (unscaled) sensor windows; y_raw and subjects hold the
# activity label and subject id of each window.
X, y_raw, subjects = create_subject_activity_windows(combined_df, WINDOW_SIZE, STRIDE)
print(f"Created {len(X)} windows with shape {X.shape}")

# --- Encode labels ---
# Map the activity labels to integer class indices; label_encoder.classes_
# keeps the index -> label mapping that is exported for the app later on.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
print(f"Encoded {num_classes} classes: {list(label_encoder.classes_)}")

# --- Split data (Train/Test) ---
# Stratified 75/25 split over individual windows with a fixed random_state.
# NOTE(review): STRIDE (15) is smaller than WINDOW_SIZE (60), so neighbouring
# windows share rows, and this split is made per window rather than per
# subject. Overlapping windows can therefore land on both sides of the split,
# which may make the reported test accuracy optimistic — consider a
# subject-wise split; confirm against the intended evaluation protocol.
X_train, X_test, y_train, y_test, subjects_train, subjects_test = train_test_split(
    X, y, subjects, test_size=0.25, random_state=42, stratify=y)

# --- Scale features ---
# The scaler is fit ONLY on the training data to prevent data leakage.
# This 'scaler' object is the single source of truth for all subsequent steps.
# Windows are flattened to (n_windows * timesteps, num_features) so that one
# mean/scale pair is learned per sensor channel, then restored to 3-D.
num_samples_train, timesteps, num_features = X_train.shape
scaler = StandardScaler()
X_train_reshaped = X_train.reshape(-1, num_features)
X_train_scaled_reshaped = scaler.fit_transform(X_train_reshaped)
X_train_scaled = X_train_scaled_reshaped.reshape(num_samples_train, timesteps, num_features)

# Apply the SAME scaler to the test data (transform only, no re-fitting).
X_test_reshaped = X_test.reshape(-1, num_features)
X_test_scaled_reshaped = scaler.transform(X_test_reshaped)
X_test_scaled = X_test_scaled_reshaped.reshape(X_test.shape[0], timesteps, num_features)
print("Feature scaling complete. Scaler is fit and ready.")


# ==============================================================================
# --- Block 3: TCN Model Definition and Training ---
# ==============================================================================
print("\n--- Block 3: TCN Model Definition and Training ---")

def residual_block(x, dilation_rate, nb_filters, kernel_size, dropout_rate=0.0):
    """Build one residual block of dilated 1-D convolutions.

    The main branch applies two identical stages, each consisting of
    Conv1D ('same' padding) -> BatchNormalization -> ReLU -> SpatialDropout1D.
    The block input is added back to the branch output; when their channel
    counts differ, the input is first passed through a kernel-size-1 Conv1D
    so the two tensors can be summed.

    Args:
        x: Input Keras tensor.
        dilation_rate: Dilation rate used by both main convolutions.
        nb_filters: Number of filters of every convolution in the block.
        kernel_size: Kernel size of the two main convolutions.
        dropout_rate: Rate passed to both SpatialDropout1D layers.

    Returns:
        The Keras tensor produced by adding the shortcut to the branch output.
    """
    shortcut = x
    branch = x
    for _ in range(2):
        branch = Conv1D(
            filters=nb_filters,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            padding='same',
        )(branch)
        branch = BatchNormalization()(branch)
        branch = Activation('relu')(branch)
        branch = SpatialDropout1D(dropout_rate)(branch)

    # Match the channel count of the shortcut to the branch when they differ.
    if shortcut.shape[-1] != branch.shape[-1]:
        shortcut = Conv1D(nb_filters, 1, padding='same')(shortcut)
    return Add()([shortcut, branch])

def build_tcn_model(input_shape, num_classes):
    """Assemble the TCN classifier.

    Stacks one residual block per entry of ``DILATION_RATES`` (configured with
    ``NUM_FILTERS``, ``KERNEL_SIZE`` and ``SPATIAL_DROPOUT_RATE``), then applies
    global average pooling, dropout at ``FINAL_DROPOUT_RATE`` and a softmax
    Dense layer.

    Args:
        input_shape: Shape of one input sample, without the batch dimension.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    inputs = Input(shape=input_shape)

    features = inputs
    for dilation in DILATION_RATES:
        features = residual_block(
            features, dilation, NUM_FILTERS, KERNEL_SIZE, SPATIAL_DROPOUT_RATE
        )

    pooled = GlobalAveragePooling1D()(features)
    pooled = Dropout(FINAL_DROPOUT_RATE)(pooled)
    probabilities = Dense(num_classes, activation='softmax')(pooled)
    return Model(inputs=inputs, outputs=probabilities)

# Build and compile the network. Labels are integer class indices, hence the
# sparse categorical cross-entropy loss.
model = build_tcn_model((WINDOW_SIZE, num_features), num_classes)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# --- Callbacks and Class Weights ---
# The checkpoint keeps the model with the best val_accuracy, while the
# learning-rate schedule and early stopping both follow val_loss.
checkpoint_path = OUTPUT_DIR / f'{FILE_PREFIX}har_model.keras'
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
    verbose=1,
)

# 'balanced' weights computed from the training labels, keyed by class index.
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weights_dict = {index: weight for index, weight in enumerate(class_weights)}
print(f"Class weights computed: {class_weights_dict}")

# --- Train the model ---
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=[model_checkpoint, reduce_lr, early_stopping],
    class_weight=class_weights_dict,
    verbose=1,
)


# ==============================================================================
# --- Block 4: Keras Model Evaluation (Benchmark) ---
# ==============================================================================
print("\n--- Block 4: Keras Model Evaluation (Benchmark) ---")
print("Loading best Keras model weights saved during training...")
# Evaluate the model file written by the checkpoint callback.
best_model = tf.keras.models.load_model(checkpoint_path)

loss, keras_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Keras Test Accuracy (Ground Truth): {keras_accuracy:.4f}")

# Per-class metrics: predicted class = index of the highest probability.
keras_probabilities = best_model.predict(X_test_scaled)
y_pred_keras = keras_probabilities.argmax(axis=1)
print("\nKeras Model Classification Report:")
print(classification_report(y_test, y_pred_keras, target_names=label_encoder.classes_))


print("\n--- Saving the definitive test set artifacts ---")
test_set_output_dir = OUTPUT_DIR / 'definitive_test_set'
test_set_output_dir.mkdir(parents=True, exist_ok=True)

# Persist the scaled test windows together with their integer labels.
for array_file, array_data in (('X_test_scaled.npy', X_test_scaled), ('y_test.npy', y_test)):
    np.save(test_set_output_dir / array_file, array_data)

print(f"Definitive test set saved to: {test_set_output_dir}")

# ==============================================================================
# --- Block 5: Exporting Assets for Android (TFLite, JSON) ---
# ==============================================================================
print("\n--- Block 5: Exporting Assets for Android (TFLite, JSON) ---")

# --- 5.1: Save Scaler as JSON ---
# Per-feature mean and scale of the fitted StandardScaler, as plain lists.
scaler_path_json = OUTPUT_DIR / f'{FILE_PREFIX}scaler.json'
scaler_dict = {"mean": scaler.mean_.tolist(), "scale": scaler.scale_.tolist()}
scaler_path_json.write_text(json.dumps(scaler_dict, indent=4))
print(f"Scaler saved to: {scaler_path_json}")

# --- 5.2: Save Labels as JSON ---
# Class index (as a string key) -> activity label, in encoder order.
labels_path_json = OUTPUT_DIR / f'{FILE_PREFIX}labels.json'
labels_dict = {
    str(class_index): class_label
    for class_index, class_label in enumerate(label_encoder.classes_)
}
labels_path_json.write_text(json.dumps(labels_dict, indent=4))
print(f"Labels saved to: {labels_path_json}")

# --- 5.3: Convert Keras model to Quantized TFLite ---
def representative_dataset_gen():
    """Yield calibration windows for post-training quantization.

    Draws up to 300 distinct windows from the in-memory ``X_train_scaled``
    array (the same scaled data the model was trained on) and yields each one
    as a single-element list holding a float32 batch of size 1.

    The sample is drawn from a generator with a fixed seed, so every call picks
    the same windows for a given training array. This keeps the calibration,
    and therefore the quantized model, reproducible across runs instead of
    depending on the state of NumPy's global random generator.

    Yields:
        list: ``[window]`` where ``window`` is one float32 batch of size 1
        sliced from ``X_train_scaled``.
    """
    num_samples = min(300, len(X_train_scaled))
    rng = np.random.default_rng(42)
    for i in rng.choice(len(X_train_scaled), size=num_samples, replace=False):
        yield [X_train_scaled[i:i+1].astype(np.float32)]

print("Starting TFLite conversion with full integer quantization...")
tflite_model_path = OUTPUT_DIR / f'{FILE_PREFIX}har_model.tflite'

converter = tf.lite.TFLiteConverter.from_keras_model(best_model)
# Restrict the converter to int8 built-in ops and int8 input/output tensors.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
# Default optimizations plus a representative dataset used for calibration.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset_gen
tflite_quant_model = converter.convert()

# Write the serialized model next to the other exported assets.
tflite_model_path.write_bytes(tflite_quant_model)
print(f"Quantized TFLite model saved to: {tflite_model_path}")


# ==============================================================================
# --- Block 6: TFLite Model Verification ---
# ==============================================================================
print("\n--- Block 6: TFLite Model Verification ---")
# This block simulates how the Android app would use the assets: each raw test
# window is scaled, quantized to int8, run through the TFLite interpreter and
# the result compared with the Keras benchmark.

# --- Load the TFLite model and allocate tensors ---
interpreter = tf.lite.Interpreter(model_path=str(tflite_model_path))
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]

# The quantization parameters are read from the tensor details once, outside
# the per-window loop, because they do not change between invocations.
input_scale, input_zero_point = input_details['quantization']
output_scale, output_zero_point = output_details['quantization']
int8_limits = np.iinfo(np.int8)

print("TFLite model loaded. Verifying performance on the test set...")
y_pred_tflite = []

# --- Loop through the test set and perform inference ---
# This mimics per-window prediction on a device.
for i in range(len(X_test)):
    # Get a single window of raw (unscaled) test data
    test_window_raw = X_test[i]

    # **Step A: Preprocessing (just like the app will do)**
    # 1. Scale the raw data using the same scaler
    test_window_scaled = scaler.transform(test_window_raw)

    # 2. Quantize to int8: q = round(x / scale + zero_point), clamped to the
    #    int8 range. A bare float -> int8 cast would truncate toward zero
    #    instead of rounding, and would wrap values that fall outside
    #    [-128, 127] (e.g. outliers beyond the calibrated range) to arbitrary
    #    numbers. The app must apply the same round-and-clamp.
    test_window_quantized = np.clip(
        np.round(test_window_scaled / input_scale + input_zero_point),
        int8_limits.min,
        int8_limits.max,
    ).astype(np.int8)

    # **Step B: TFLite Inference**
    interpreter.set_tensor(input_details['index'], test_window_quantized[np.newaxis, ...])
    interpreter.invoke()

    # **Step C: Post-processing**
    # 1. Get the quantized output
    output_quantized = interpreter.get_tensor(output_details['index'])[0]

    # 2. De-quantize the output back to float32 probabilities
    output_dequantized = (output_quantized.astype(np.float32) - output_zero_point) * output_scale

    y_pred_tflite.append(np.argmax(output_dequantized))

# --- Calculate TFLite accuracy ---
y_pred_tflite = np.array(y_pred_tflite)
tflite_accuracy = accuracy_score(y_test, y_pred_tflite)

print(f"\nTFLite Model Test Accuracy: {tflite_accuracy:.4f}")
print("\nTFLite Model Classification Report:")
print(classification_report(y_test, y_pred_tflite, target_names=label_encoder.classes_))


# ==============================================================================
# --- Block 7: Final Report ---
# ==============================================================================
print("\n--- Block 7: Final Report ---")
print("="*40)
print("  VERIFICATION AND ACCURACY REPORT")
print("="*40)
print(f"Original Keras Model Accuracy: {keras_accuracy:.4f}")
print(f"Quantized TFLite Model Accuracy: {tflite_accuracy:.4f}")
accuracy_drop = keras_accuracy - tflite_accuracy
print(f"Accuracy Drop: {accuracy_drop:.4f} ({accuracy_drop*100:.2f}%)")

if accuracy_drop < 0.02:
    print("\nVERDICT: SUCCESS! The TFLite model's accuracy is very close to the original.")
    print("The assets are consistent and ready for deployment.")
else:
    print("\nVERDICT: WARNING! The accuracy drop is larger than expected (>2%).")
    print("Review the model architecture or quantization process.")
print("="*40)

# Final check of all created files in the output directory
print("\nFinal assets created in your Google Drive:")
!ls -lh {OUTPUT_DIR}

--- Block 1: Setup and Initial Configuration ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All outputs will be saved to: /content/drive/MyDrive/Colab_HAR_Project/results/TCN_Results_v2_with_Assets

--- GPU Check ---
Found 1 GPU(s). Enabling memory growth.
-------------------------

--- Block 2: Data Loading and Preprocessing ---
Combined dataset has 850849 rows.
Processing 178 subject-activity groups for windowing...
Created 56083 windows with shape (56083, 60, 6)
Encoded 5 classes: [np.str_('A'), np.str_('B'), np.str_('C'), np.str_('D'), np.str_('E')]
Feature scaling complete. Scaler is fit and ready.

--- Block 3: TCN Model Definition and Training ---


Class weights computed: {0: np.float64(0.6842130947539651), 1: np.float64(0.9696173351775011), 2: np.float64(2.7772862330802246), 3: np.float64(0.9313994685562444), 4: np.float64(0.9316057585825027)}
Epoch 1/100
[1m526/526[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.7788 - loss: 0.6656
Epoch 1: val_accuracy improved from -inf to 0.94794, saving model to /content/drive/MyDrive/Colab_HAR_Project/results/TCN_Results_v2_with_Assets/tcn_v2_ABCDE_har_model.keras
[1m526/526[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 30ms/step - accuracy: 0.7789 - loss: 0.6651 - val_accuracy: 0.9479 - val_loss: 0.1427 - learning_rate: 0.0010
Epoch 2/100
[1m523/526[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - accuracy: 0.9283 - loss: 0.1941
Epoch 2: val_accuracy improved from 0.94794 to 0.95602, saving model to /content/drive/MyDrive/Colab_HAR_Project/results/TCN_Results_v2_with_Assets/tcn_v2_ABCDE_har_model.keras
[1m526/526[0m [32m━━━━━━━━



Quantized TFLite model saved to: /content/drive/MyDrive/Colab_HAR_Project/results/TCN_Results_v2_with_Assets/tcn_v2_ABCDE_har_model.tflite

--- Block 6: TFLite Model Verification ---
TFLite model loaded. Verifying performance on the test set...

TFLite Model Test Accuracy: 0.9939

TFLite Model Classification Report:
              precision    recall  f1-score   support

           A       1.00      0.99      1.00      4099
           B       1.00      1.00      1.00      2892
           C       0.99      1.00      0.99      1010
           D       0.99      0.99      0.99      3010
           E       0.98      1.00      0.99      3010

    accuracy                           0.99     14021
   macro avg       0.99      0.99      0.99     14021
weighted avg       0.99      0.99      0.99     14021


--- Block 7: Final Report ---
  VERIFICATION AND ACCURACY REPORT
Original Keras Model Accuracy: 0.9965
Quantized TFLite Model Accuracy: 0.9939
Accuracy Drop: 0.0026 (0.26%)

VERDICT: SUCCESS! 