In [1]:
import os
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import json

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from keras import utils

# Fix the random seed through Keras' helper so repeated runs of the notebook are reproducible.
utils.set_random_seed(42)

In [2]:
# Define Parameters
LOOKBACK = 1  # number of past minRSSI steps used as input for each sample (see create_sequences)
HORIZON = 24  # number of future steps forecast per sample; plot_predictions treats one step as 30 minutes
N_SPLITS = 4  # default number of TimeSeriesSplit folds in time_series_split
BATCH_SIZE = 32  # not referenced in the visible cells -- presumably kept for parity with trained models; TODO confirm
EPOCHS = 20  # not referenced in the visible cells -- TODO confirm whether it is still needed
model_name='base_noundersampling'  # run identifier; not referenced in the visible cells

In [3]:
# Read the project configuration; the relative path is resolved against the current working directory.
with open('../../../config.json', 'r') as config_file:
    config = json.load(config_file)

# config['fulldataset'] is the directory that contains the labelled per-cell CSV.
df = pd.read_csv(os.path.join(config['fulldataset'], 'cell_labeled.csv'))

# Quick sanity check of what was loaded: dimensions and the first rows.
print(df.shape)
df.head()

(12868886, 11)


Unnamed: 0,timestamp,cell,bts,antenna,carrier,minRSSI,PageSessionTotal,ULvolMByte,AnomalyDay,anomaly,noise
0,2023-09-01 02:30:00+00:00,997_0_0,997,0,0,-109.08,0,0.0,0,0,0
1,2023-09-01 02:30:00+00:00,580_0_1,580,0,1,-103.58,67,0.150356,0,0,0
2,2023-09-01 02:30:00+00:00,580_1_0,580,1,0,-107.79,0,0.352035,0,0,0
3,2023-09-01 02:30:00+00:00,580_1_1,580,1,1,-107.16,58,0.215608,0,0,0
4,2023-09-01 02:30:00+00:00,580_2_0,580,2,0,-106.63,0,0.184871,0,0,0


In [4]:
# Default feature-column lists for scale_data_split. Both are empty here, so
# that function skips feature scaling and only scales the minRSSI target.
temporal_X = []  # time-variant feature columns (scaled with StandardScaler when non-empty)
static_X = []  # time-invariant feature columns (scaled with MinMaxScaler when non-empty)

# Helper functions

In [5]:
# Time series split function (Expanding Window)
def time_series_split(df, n_splits=N_SPLITS, test_size=0.2):
    """Hold out the latest rows as a test set and cut the rest into CV folds.

    The frame is ordered by its ``timestamp`` column. The final ``test_size``
    fraction of rows becomes the test set; the remaining rows are handed to
    ``TimeSeriesSplit`` to produce ``n_splits`` (train, validation) pairs.

    Args:
        df: DataFrame with a ``timestamp`` column.
        n_splits: Number of folds requested from ``TimeSeriesSplit``.
        test_size: Fraction of rows (by count) reserved for the test set.

    Returns:
        ``(splits, test_df)`` where ``splits`` is a list of
        ``(train_df, val_df)`` DataFrame pairs.
    """
    ordered = df.sort_values('timestamp')

    # Row position where the held-out test portion begins.
    cutoff = int(len(ordered) * (1 - test_size))
    train_val_df, test_df = ordered.iloc[:cutoff], ordered.iloc[cutoff:]

    folds = []
    splitter = TimeSeriesSplit(n_splits=n_splits)
    for train_index, val_index in splitter.split(train_val_df):
        fold_train = train_val_df.iloc[train_index]
        fold_val = train_val_df.iloc[val_index]
        folds.append((fold_train, fold_val))

    return folds, test_df

In [6]:
# Sequence creation for univariate time series
def create_sequences(df, lookback=LOOKBACK, horizon=HORIZON):
    """Build per-cell sliding-window samples for the univariate minRSSI series.

    For every cell, each run of ``lookback + horizon`` consecutive rows yields
    one sample: the first ``lookback`` minRSSI values are the model input and
    the following ``horizon`` values are the target. Windows never mix rows of
    different cells. Rows are used in the order in which they appear in
    ``df``, so the frame is expected to be in chronological order already.

    Args:
        df: DataFrame containing the columns ``cell``, ``minRSSI`` and
            ``anomaly``.
        lookback: Number of past steps in each input sequence.
        horizon: Number of future steps in each target sequence.

    Returns:
        Tuple ``(X, y, anomaly, cell_id)`` of NumPy arrays shaped
        ``(n, lookback, 1)``, ``(n, horizon)``, ``(n, horizon)`` and
        ``(n, horizon)``. If no cell has enough rows for a single window,
        ``n`` is 0 but the trailing dimensions are kept so that downstream
        indexing such as ``X[:, -1, 0]`` still works.
    """
    window = lookback + horizon
    # Requires NumPy >= 1.20. Produces zero-copy (n_windows, window) views.
    sliding = np.lib.stride_tricks.sliding_window_view

    X_parts, y_parts, anomaly_parts, cell_parts = [], [], [], []

    # groupby(sort=False) visits cells in first-appearance order (the same
    # order as df['cell'].unique()) while partitioning the frame only once,
    # instead of running a full boolean scan of df for every cell.
    for _, cell_df in df.groupby('cell', sort=False):
        if len(cell_df) < window:
            continue  # too few rows to form even one sample

        rssi_windows = sliding(cell_df['minRSSI'].to_numpy(), window)
        anomaly_windows = sliding(cell_df['anomaly'].to_numpy(), window)
        cell_windows = sliding(cell_df['cell'].to_numpy(), window)

        # First `lookback` columns are the input, the rest is the horizon.
        X_parts.append(rssi_windows[:, :lookback, np.newaxis])
        y_parts.append(rssi_windows[:, lookback:])
        anomaly_parts.append(anomaly_windows[:, lookback:])
        cell_parts.append(cell_windows[:, lookback:])

    if not X_parts:
        return (
            np.empty((0, lookback, 1)),
            np.empty((0, horizon)),
            np.empty((0, horizon), dtype=int),
            np.empty((0, horizon), dtype=object),
        )

    # np.concatenate copies the read-only window views into ordinary arrays.
    return (
        np.concatenate(X_parts),
        np.concatenate(y_parts),
        np.concatenate(anomaly_parts),
        np.concatenate(cell_parts),
    )

In [7]:
def scale_data_split(train_df, val_df, temporal_features=temporal_X, static_features=static_X):
    """Fit scalers on the training frame and apply them to both frames.

    Every scaler is fitted on ``train_df`` only and then used to transform
    ``val_df``. Both frames are modified in place and also returned. A feature
    scaler whose column list is empty is returned unfitted.

    Args:
        train_df: Training DataFrame; must contain ``minRSSI``.
        val_df: Validation DataFrame with the same columns.
        temporal_features: Time-variant columns, standardised with ``StandardScaler``.
        static_features: Time-invariant columns, scaled with ``MinMaxScaler``.

    Returns:
        ``(train_df, val_df, scaler_target, scaler_temporal, scaler_static)``.
    """
    scaler_temporal, scaler_static, scaler_target = StandardScaler(), MinMaxScaler(), StandardScaler()

    # Feature groups share the same fit-on-train / transform-on-val recipe.
    feature_groups = (
        (temporal_features, scaler_temporal),
        (static_features, scaler_static),
    )
    for columns, scaler in feature_groups:
        if not columns:
            continue
        train_df[columns] = scaler.fit_transform(train_df[columns])
        val_df[columns] = scaler.transform(val_df[columns])

    # The target gets its own scaler so predictions can be inverse-transformed later.
    target_column = ['minRSSI']
    train_df['minRSSI'] = scaler_target.fit_transform(train_df[target_column])
    val_df['minRSSI'] = scaler_target.transform(val_df[target_column])

    return train_df, val_df, scaler_target, scaler_temporal, scaler_static

In [8]:
def train_validate_dumb(splits, lookback=LOOKBACK, horizon=HORIZON):
    """Validate the persistence ("repeat the last value") baseline on each split.

    For every ``(train_df, val_df)`` pair the frames are scaled (scalers fitted
    on the training part), turned into sequences, and the last observed
    minRSSI of each validation window is repeated over the whole horizon as
    the forecast. Errors are reported in original units, overall and for the
    time steps flagged ``anomaly == 1``.

    Args:
        splits: Iterable of ``(train_df, val_df)`` DataFrame pairs.
        lookback: Number of past steps per input sequence.
        horizon: Number of future steps per forecast.

    Returns:
        ``(summary_results, scalers)``. ``summary_results`` maps metric names
        to their average over the splits plus a formatted
        ``'Total Training Time'`` string (this times only the baseline
        prediction step). ``scalers`` holds the scalers fitted on the last
        split processed, or ``{}`` if ``splits`` is empty.
    """
    results = []
    scalers = {}
    total_training_time = 0

    for i, (train_df, val_df) in enumerate(splits):
        print(f"\nProcessing Split {i + 1}/{len(splits)}")

        # Scale copies so the caller's frames keep their original units.
        scaled_train, scaled_val, scaler_target, scaler_temporal, scaler_static = scale_data_split(train_df.copy(), val_df.copy())
        scalers = {'scaler_target': scaler_target, 'scaler_temporal': scaler_temporal, 'scaler_static': scaler_static}

        # Honour the lookback/horizon arguments rather than the module-level
        # constants. The training sequences are only used for the shape
        # report below; the baseline has nothing to fit.
        X_train, y_train, _, _ = create_sequences(scaled_train, lookback, horizon)
        X_val, y_val, val_anomalies, _ = create_sequences(scaled_val, lookback, horizon)

        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")

        # Start timer
        start_time = time.time()

        # Persistence forecast: repeat the last observed minRSSI over the horizon.
        y_pred_baseline = X_val[:, -1, 0].reshape(-1, 1).repeat(horizon, axis=1)

        # End timer
        split_training_time = time.time() - start_time
        total_training_time += split_training_time

        # Evaluate in original units.
        y_val_og = scalers['scaler_target'].inverse_transform(y_val)
        y_pred_og = scalers['scaler_target'].inverse_transform(y_pred_baseline)

        mae = mean_absolute_error(y_val_og, y_pred_og)
        rmse = np.sqrt(mean_squared_error(y_val_og, y_pred_og))

        # Anomaly-based evaluation: one value per horizon step, NaN where the
        # step has no anomalous sample.
        anom_mae, anom_rmse = [], []
        for step in range(horizon):
            step_anomaly_mask = val_anomalies[:, step] == 1
            if np.any(step_anomaly_mask):
                anom_mae.append(mean_absolute_error(y_val_og[step_anomaly_mask, step], y_pred_og[step_anomaly_mask, step]))
                anom_rmse.append(np.sqrt(mean_squared_error(y_val_og[step_anomaly_mask, step], y_pred_og[step_anomaly_mask, step])))
            else:
                anom_mae.append(np.nan)
                anom_rmse.append(np.nan)

        # Average once, without the "Mean of empty slice" warning that
        # np.nanmean emits when a split contains no anomalies at all.
        split_anom_mae = _nanmean_or_nan(anom_mae)
        split_anom_rmse = _nanmean_or_nan(anom_rmse)

        results.append({'split': i + 1, 'Overall_MAE': mae, 'Overall_RMSE': rmse,
                        'Anom_MAE': split_anom_mae, 'Anom_RMSE': split_anom_rmse})

        print(f"Split {i + 1} - Overall MAE: {mae:.4f}, Overall RMSE: {rmse:.4f}, "
              f"Anomaly MAE: {split_anom_mae:.4f}, Anomaly RMSE: {split_anom_rmse:.4f}")

    # Convert total training time to minutes and seconds format
    minutes, seconds = divmod(total_training_time, 60)

    # Aggregate results; splits without anomalies (NaN) are left out of the
    # anomaly averages.
    avg_overall_mae = np.mean([res['Overall_MAE'] for res in results])
    avg_overall_rmse = np.mean([res['Overall_RMSE'] for res in results])
    avg_overall_anom_mae = _nanmean_or_nan([res['Anom_MAE'] for res in results])
    avg_overall_anom_rmse = _nanmean_or_nan([res['Anom_RMSE'] for res in results])

    summary_results = {
        'Average Overall MAE': avg_overall_mae,
        'Average Overall RMSE': avg_overall_rmse,
        'Average Overall Anomaly MAE': avg_overall_anom_mae,
        'Average Overall Anomaly RMSE': avg_overall_anom_rmse,
        'Total Training Time': f"{int(minutes)}m {int(seconds)}s"
    }

    return summary_results, scalers


def _nanmean_or_nan(values):
    """Return the NaN-ignoring mean of ``values``, or NaN when nothing is left to average."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)

In [9]:
def evaluate_dumb(test_df, scalers, lookback=LOOKBACK, horizon=HORIZON):
    """Evaluate the persistence baseline on the test set.

    The target is scaled with ``scalers['scaler_target']``, sequences are
    built, and the last observed value of each window is repeated over the
    horizon as the forecast. Metrics are computed in original units for each
    horizon step, overall and restricted to samples flagged ``anomaly == 1``.

    Args:
        test_df: Test DataFrame with ``cell``, ``minRSSI`` and ``anomaly``
            columns. It is not modified.
        scalers: Dict whose ``'scaler_target'`` entry is an already fitted
            scaler exposing ``transform`` and ``inverse_transform``.
        lookback: Number of past steps per input sequence.
        horizon: Number of future steps per forecast.

    Returns:
        ``(results, predictions_df)``. ``results`` holds per-step and overall
        MAE/RMSE, plus the anomaly-only variants. ``predictions_df`` has one
        row per sample with ``cell_id`` and, for each step ``k``, the columns
        ``actual_k``, ``predicted_k`` and ``anomaly_k``.
    """
    print("Preparing Baseline Test Data...")

    # Normalize a copy: writing into the caller's frame would make a second
    # call scale already-scaled values.
    scaled_test = test_df.copy()
    scaled_test['minRSSI'] = scalers['scaler_target'].transform(scaled_test[['minRSSI']])

    # Prepare test sequences
    X_test, y_test, test_anomalies, test_cells = create_sequences(scaled_test, lookback, horizon)

    # Baseline predictions: repeat the last value of each lookback window.
    y_pred_baseline = X_test[:, -1, 0].reshape(-1, 1).repeat(horizon, axis=1)

    # Inverse transform predictions and actual values
    y_test_original = scalers['scaler_target'].inverse_transform(y_test).reshape(-1, horizon)
    y_pred_original = scalers['scaler_target'].inverse_transform(y_pred_baseline).reshape(-1, horizon)

    # Calculate per-step MAE and RMSE
    per_step_mae, per_step_rmse = [], []
    per_step_anom_mae, per_step_anom_rmse = [], []

    print("\nEvaluating Baseline...")
    for step in range(horizon):
        # General per-step metrics (MAE, RMSE)
        mae_step = mean_absolute_error(y_test_original[:, step], y_pred_original[:, step])
        rmse_step = np.sqrt(mean_squared_error(y_test_original[:, step], y_pred_original[:, step]))
        per_step_mae.append(mae_step)
        per_step_rmse.append(rmse_step)

        # Anomaly-specific metrics (only samples where anomaly == 1 at this step)
        step_anomaly_mask = test_anomalies[:, step] == 1
        if np.any(step_anomaly_mask):
            anom_mae_step = mean_absolute_error(y_test_original[step_anomaly_mask, step], y_pred_original[step_anomaly_mask, step])
            anom_rmse_step = np.sqrt(mean_squared_error(y_test_original[step_anomaly_mask, step], y_pred_original[step_anomaly_mask, step]))
        else:
            anom_mae_step, anom_rmse_step = np.nan, np.nan

        per_step_anom_mae.append(anom_mae_step)
        per_step_anom_rmse.append(anom_rmse_step)

    # Overall figures are plain averages of the per-step values. Note that
    # the RMSE figures are therefore means of per-step RMSEs, not an RMSE
    # computed over all elements at once.
    overall_mae = np.mean(per_step_mae)
    overall_rmse = np.mean(per_step_rmse)

    # Skip np.nanmean when every step is NaN (no anomalies anywhere) so it
    # does not emit a "Mean of empty slice" warning; the result is NaN anyway.
    anom_mae_arr = np.asarray(per_step_anom_mae, dtype=float)
    anom_rmse_arr = np.asarray(per_step_anom_rmse, dtype=float)
    overall_anom_mae = np.nan if np.isnan(anom_mae_arr).all() else np.nanmean(anom_mae_arr)
    overall_anom_rmse = np.nan if np.isnan(anom_rmse_arr).all() else np.nanmean(anom_rmse_arr)

    print(f"Baseline Test MAE: {overall_mae:.4f}, Test RMSE: {overall_rmse:.4f}")
    print(f"Baseline Test Anomaly MAE: {overall_anom_mae:.4f}, Test Anomaly RMSE: {overall_anom_rmse:.4f}")

    # Assemble the predictions frame column by column rather than one dict
    # per sample. Column order is unchanged: cell_id, then
    # actual/predicted/anomaly for each step.
    columns = {'cell_id': test_cells[:, 0]}
    for step in range(horizon):
        columns[f'actual_{step+1}'] = y_test_original[:, step]
        columns[f'predicted_{step+1}'] = y_pred_original[:, step]
        columns[f'anomaly_{step+1}'] = test_anomalies[:, step]

    predictions_df = pd.DataFrame(columns)

    # Return the results dictionary (evaluation metrics) and predictions DataFrame
    results = {
        'MAE_per_step': per_step_mae,
        'RMSE_per_step': per_step_rmse,
        'Anom_MAE_per_step': per_step_anom_mae,
        'Anom_RMSE_per_step': per_step_anom_rmse,
        'Overall_MAE': overall_mae,
        'Overall_RMSE': overall_rmse,
        'Overall_Anomaly_MAE': overall_anom_mae,
        'Overall_Anomaly_RMSE': overall_anom_rmse
    }

    return results, predictions_df

In [10]:
def plot_predictions(pred_df_path, cell_id, horizon):
    """Plot actual vs. predicted minRSSI for one cell at one forecast step.

    Args:
        pred_df_path: Path to a predictions CSV containing ``cell_id`` and the
            columns ``actual_<k>``, ``predicted_<k>`` and ``anomaly_<k>``.
        cell_id: Value of ``cell_id`` whose rows are plotted.
        horizon: Forecast step ``k`` (1-based) selecting which columns to plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Load the predictions and keep only the requested cell.
    pred_df = pd.read_csv(pred_df_path)
    cell_data = pred_df[pred_df['cell_id'] == cell_id]

    # Series for the requested forecast step.
    actual_values = cell_data[f'actual_{horizon}'].values
    predicted_values = cell_data[f'predicted_{horizon}'].values
    anomalies = cell_data[f'anomaly_{horizon}'].values

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(actual_values, label='Actual', color='blue')
    ax.plot(predicted_values, label='Predicted', color='orange')

    # Highlight anomalies with red dots drawn on top of the actual series.
    anomaly_indices = anomalies == 1
    ax.scatter(np.arange(len(actual_values))[anomaly_indices],
               actual_values[anomaly_indices], color='red', label='Anomaly', marker='o', s=30, edgecolors='k', zorder=5)

    if horizon == 1:
        time_step_desc = "30 minutes"
    else:
        time_step_desc = f"{horizon * 0.5} hours"  # Each horizon step is 30 minutes

    # The lead-time description was previously computed but never shown;
    # use it as the title so the figure is self-explanatory.
    ax.set_title(f"Cell {cell_id} - forecast {time_step_desc} ahead")
    ax.set_xlabel('Time Step')
    ax.set_ylabel('minRSSI')
    ax.legend()
    fig.tight_layout()
    plt.show()

# Prepare data

In [11]:
# Chronological hold-out plus cross-validation folds. The fold count comes
# from the N_SPLITS constant in the parameter cell instead of a repeated literal.
splits, test_set = time_series_split(df, N_SPLITS)

# Report the size of every fold and of the held-out test set.
for i, (train, val) in enumerate(splits):
    print(f"Split {i + 1}:")
    print(f"  Train set shape: {train.shape}")
    print(f"  Validation set shape: {val.shape}")

print(f"Test set shape: {test_set.shape}")

Split 1:
  Train set shape: (2059024, 11)
  Validation set shape: (2059021, 11)
Split 2:
  Train set shape: (4118045, 11)
  Validation set shape: (2059021, 11)
Split 3:
  Train set shape: (6177066, 11)
  Validation set shape: (2059021, 11)
Split 4:
  Train set shape: (8236087, 11)
  Validation set shape: (2059021, 11)
Test set shape: (2573778, 11)


# Train model

In [14]:
# Validate the baseline across all splits; `scalers` receives the scalers fitted on the last split processed.
# NOTE(review): the recorded output of this cell ends at "Processing Split 4/4" with no metrics,
# so the saved run apparently did not finish -- confirm before relying on summary_results.
summary_results, scalers = train_validate_dumb(splits, lookback=LOOKBACK, horizon=HORIZON)


Processing Split 1/4
X_train shape: (2042824, 1, 1), y_train shape: (2042824, 24)
X_val shape: (2042821, 1, 1), y_val shape: (2042821, 24)


  'Anom_MAE': np.nanmean(anom_mae), 'Anom_RMSE': np.nanmean(anom_rmse)})
  f"Anomaly MAE: {np.nanmean(anom_mae):.4f}, Anomaly RMSE: {np.nanmean(anom_rmse):.4f}")


Split 1 - Overall MAE: 0.5396, Overall RMSE: 1.0366, Anomaly MAE: nan, Anomaly RMSE: nan

Processing Split 2/4
X_train shape: (4101845, 1, 1), y_train shape: (4101845, 24)
X_val shape: (2042821, 1, 1), y_val shape: (2042821, 24)
Split 2 - Overall MAE: 0.6287, Overall RMSE: 1.3189, Anomaly MAE: 17.5851, Anomaly RMSE: 26.6654

Processing Split 3/4
X_train shape: (6160866, 1, 1), y_train shape: (6160866, 24)
X_val shape: (2042821, 1, 1), y_val shape: (2042821, 24)
Split 3 - Overall MAE: 0.7537, Overall RMSE: 1.5877, Anomaly MAE: 13.4217, Anomaly RMSE: 14.4786

Processing Split 4/4


: 

In [12]:
# Run the train_validate_dumb function on the fourth split only
split4 = [splits[3]]  # Extract the fourth split (wrapped in a list because the function iterates over splits)
# This rebinds `scalers` to the scalers fitted on this split's training data.
summary_results_split4, scalers = train_validate_dumb(split4, lookback=LOOKBACK, horizon=HORIZON)

# Print the results for the fourth split
print("\nResults for the Fourth Split:")
for metric, value in summary_results_split4.items():
    try:
        # Numeric metrics are shown with four decimals.
        print(f"{metric}: {float(value):.4f}")
    except ValueError:
        # The training-time entry is a formatted string that float() cannot parse; print it unchanged.
        print(f"{metric}: {value}")


Processing Split 1/1
X_train shape: (8219887, 1, 1), y_train shape: (8219887, 24)
X_val shape: (2042821, 1, 1), y_val shape: (2042821, 24)
Split 1 - Overall MAE: 0.8064, Overall RMSE: 1.7110, Anomaly MAE: 12.3295, Anomaly RMSE: 13.8461

Results for the Fourth Split:
Average Overall MAE: 0.8064
Average Overall RMSE: 1.7110
Average Overall Anomaly MAE: 12.3295
Average Overall Anomaly RMSE: 13.8461
Total Training Time: 0m 0s


# Evaluate

In [None]:
# Score the baseline on the held-out test set, using the target scaler stored in `scalers`
# by the most recent train_validate_dumb call.
test_results, test_preds = evaluate_dumb(test_set, scalers)

Preparing Baseline Test Data...

Evaluating Baseline...
Baseline Test MAE: 1.0169, Test RMSE: 2.1695
Baseline Test Anomaly MAE: 9.4446, Test Anomaly RMSE: 10.8702
