In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import matthews_corrcoef, classification_report, accuracy_score

# ===============================
# 1. loading data and initial processing
# ===============================
df = pd.read_csv('uniq2_table.csv')

required_cols = [
    'data', 'Time',
    'peak_usa', 'peak_germany', 'peak_australia',
    'peak_italy', 'peak_iceland', 'peak_uk',
    'bin_flag_peak_usa', 'bin_flag_peak_germany', 'bin_flag_peak_australia',
    'bin_flag_peak_italy', 'bin_flag_peak_iceland', 'bin_flag_peak_uk'
]
assert all(col in df.columns for col in required_cols), "missing required columns!"
df.dropna(subset=required_cols, inplace=True)

cases = df['data'].unique()
train_cases, test_cases = train_test_split(cases, test_size=0.2, random_state=42)

# ===============================
# 2. function for creating X and y 
# ===============================
def build_samples_for_case(df_case):
    station_list = [
        'peak_usa', 'peak_germany', 'peak_australia',
        'peak_italy', 'peak_iceland', 'peak_uk'
    ]
    binflag_list = [
        'bin_flag_peak_usa', 'bin_flag_peak_germany', 'bin_flag_peak_australia',
        'bin_flag_peak_italy', 'bin_flag_peak_iceland', 'bin_flag_peak_uk'
    ]
    bins = np.arange(0, 241, 30)  # 0,30,60,...,240
    df_case = df_case.copy()
    df_case['time_bin'] = pd.cut(df_case['Time'], bins=bins, right=False)
    bins_intervals = df_case['time_bin'].cat.categories

    X_case_all = []
    y_case_all = []

    for station, flag_col in zip(station_list, binflag_list):
        bin_slices = []
        bin_labels = []
        valid = True

        for interval in bins_intervals:
            mask = (df_case['time_bin'] == interval)
            arr_signal = df_case.loc[mask, station].values
            arr_flag = df_case.loc[mask, flag_col].values

            count = arr_signal.shape[0]
            if count == 0:
                valid = False
                break

            if count < 1500:
                last_val = arr_signal[-1]
                padding = np.full((1500 - count,), last_val, dtype=arr_signal.dtype)
                arr_signal = np.concatenate([arr_signal, padding])
                flag_value = int(arr_flag[0]) if arr_flag.size > 0 else 0
            else:
                arr_signal = arr_signal[:1500]
                flag_value = int(arr_flag[0])

            bin_slices.append(arr_signal)
            bin_labels.append(flag_value)

        if not valid:
            continue

        X_sample = np.stack(bin_slices, axis=0).T  # (1500, 8)
        y_sample = np.array(bin_labels, dtype=int)  # (8,)

        X_case_all.append(X_sample)
        y_case_all.append(y_sample)

    return X_case_all, y_case_all

# ===============================
# 3. assembling X and y for train/test
# ===============================
X_train_list, y_train_list = [], []
X_test_list,  y_test_list  = [], []

for case in train_cases:
    df_case = df[df['data'] == case].sort_values('Time').reset_index(drop=True)
    Xc, yc = build_samples_for_case(df_case)
    X_train_list.extend(Xc)
    y_train_list.extend(yc)

for case in test_cases:
    df_case = df[df['data'] == case].sort_values('Time').reset_index(drop=True)
    Xc, yc = build_samples_for_case(df_case)
    X_test_list.extend(Xc)
    y_test_list.extend(yc)

if not X_train_list or not X_test_list:
    raise RuntimeError("after building the dataset, nothing is left!")

X_train = np.stack(X_train_list, axis=0)  # (n_train, 1500, 8)
y_train = np.stack(y_train_list, axis=0)  # (n_train, 8)
X_test  = np.stack(X_test_list, axis=0)   # (n_test, 1500, 8)
y_test  = np.stack(y_test_list, axis=0)   # (n_test, 8)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:",  X_test.shape)
print("y_test shape:",  y_test.shape)

# ===============================
# 4. calculate pos_weight for each of the 8 bins
# ===============================
pos = np.sum(y_train, axis=0)           # (8,) — count of 1
neg = y_train.shape[0] - pos            # (8,) — count of 0
pos = np.maximum(pos, 1)
neg = np.maximum(neg, 1)
pos_weight = neg / pos                  # (8,)
pos_weight_tf = tf.constant(pos_weight, dtype=tf.float32)

# ===============================
# 5. Custom weighted BCE
# ===============================
def weighted_bce(y_true, y_pred_logits):
    y_true_f = tf.cast(y_true, tf.float32)
    logits_f = tf.cast(y_pred_logits, tf.float32)
    loss_per_entry = tf.nn.weighted_cross_entropy_with_logits(
        labels=y_true_f,
        logits=logits_f,
        pos_weight=pos_weight_tf
    )
    return tf.reduce_mean(loss_per_entry, axis=-1)

# ===============================
# 6. build the model (last layer without activation)
# ===============================
model = Sequential([
    Conv1D(128, 5, padding='same', activation='relu', input_shape=(1500, 8)),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.2),

    Conv1D(256, 3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.2),

    LSTM(64, return_sequences=False),
    Dropout(0.2),

    Dense(8, activation=None)  # returns logits for the 8 bins
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=weighted_bce,
    metrics=['accuracy']
)
model.summary()

# ===============================
# 7. training
# ===============================
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, verbose=1, restore_best_weights=True
)
checkpoint = ModelCheckpoint(
    filepath='experiment_1.h5',
    monitor='val_loss', save_best_only=True, verbose=1
)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.1,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

# ===============================
# 8. evaluation of overall accuracy (all bins together)
# ===============================
# get logits and convert them to probabilities
y_pred_logits = model.predict(X_test)            # (n_test, 8)
y_pred_prob   = tf.sigmoid(y_pred_logits).numpy()  # (n_test, 8)
y_pred_bin    = (y_pred_prob >= 0.5).astype(int)   # binary predictions

y_true = y_test.astype(int)

# 1) exact match: all 8 bits simultaneously
exact_match = np.all(y_pred_bin == y_true, axis=1)
exact_match_accuracy = np.mean(exact_match)
print(f"\nExact match accuracy: {exact_match_accuracy:.4f}")

# 2) percentage of correctly predicted bits overall
flat_true = y_true.flatten()
flat_pred = y_pred_bin.flatten()
bitwise_accuracy = accuracy_score(flat_true, flat_pred)
print(f"Bitwise accuracy : {bitwise_accuracy:.4f}")

# 3) MCC and classification_report across all bits combined (flattened)

mcc_overall = matthews_corrcoef(flat_true, flat_pred)
print(f"Overall MCC: {mcc_overall:.4f}\n")

print("Classification report:")
print(classification_report(flat_true, flat_pred, zero_division=0))


X_train shape: (258, 1500, 8)
y_train shape: (258, 8)
X_test shape: (66, 1500, 8)
y_test shape: (66, 8)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step - accuracy: 0.0743 - loss: 1.0782
Epoch 1: val_loss improved from inf to 1.14820, saving model to model_8bins_weighted.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 259ms/step - accuracy: 0.0734 - loss: 1.0783 - val_accuracy: 0.1923 - val_loss: 1.1482
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step - accuracy: 0.1322 - loss: 1.0683
Epoch 2: val_loss did not improve from 1.14820
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 0.1339 - loss: 1.0678 - val_accuracy: 0.0769 - val_loss: 1.1497
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - accuracy: 0.1025 - loss: 1.0523
Epoch 3: val_loss did not improve from 1.14820
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 225ms/step - accuracy: 0.1026 - loss: 1.0522 - val_accuracy: 0.2308 - val_loss: 1.1505
Epoch 4/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.0987 - loss: 1.0471
Epoch 4: val_loss did not improve from 1.14820
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 226ms/step - accuracy: 0.1767 - loss: 1.0403 - val_accuracy: 0.0769 - val_loss: 1.0645
Epoch 10/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.1820 - loss: 0.9487
Epoch 10: val_loss improved from 1.06448 to 1.05672, saving model to model_8bins_weighted.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 232ms/step - accuracy: 0.1814 - loss: 0.9527 - val_accuracy: 0.0385 - val_loss: 1.0567
Epoch 11/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.1475 - loss: 1.0143
Epoch 11: val_loss did not improve from 1.05672
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 229ms/step - accuracy: 0.1464 - loss: 1.0138 - val_accuracy: 0.0385 - val_loss: 1.0838
Epoch 12/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.1605 - loss: 1.0030
Epoch 12: val_loss did not improve from 1.05672
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 224ms/step - accuracy: 0.1604 - loss: 1.0026 - val_accuracy: 0.0769 - val_loss: 1.0981
Epoch 13/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step - accuracy: 0.2018 - loss: 0.9755
Epoch 13: val_loss did not improve from 1.05672
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━

In [12]:
# ===============================
# 9. search for the optimal threshold by MCC
# ===============================
# use y_pred_prob and y_true from the previous code

from sklearn.metrics import matthews_corrcoef

# convert y_test probabilities into 1D arrays for MCC calculation


flat_true = y_true.flatten()
best_mcc = -1.0
best_thresh = 0.0

# iterate over thresholds from 0.0 to 1.0 with a step of 0.01
for thresh in np.linspace(0, 1, 101):
    y_pred_bin_thresh = (y_pred_prob >= thresh).astype(int).flatten()
    mcc = matthews_corrcoef(flat_true, y_pred_bin_thresh)
    if mcc > best_mcc:
        best_mcc = mcc
        best_thresh = thresh

print(f"best threshold by MCC: {best_thresh:.2f}, MCC = {best_mcc:.4f}")

# optionally — output classification_report for the optimal threshold
y_pred_optimal = (y_pred_prob >= best_thresh).astype(int).flatten()
print("\nClassification report for optimal threshold:")
print(classification_report(flat_true, y_pred_optimal, zero_division=0))


best threshold by MCC: 0.46, MCC = 0.2508

Classification report for optimal threshold:
              precision    recall  f1-score   support

           0       0.99      0.28      0.43       427
           1       0.24      0.99      0.39       101

    accuracy                           0.41       528
   macro avg       0.62      0.63      0.41       528
weighted avg       0.85      0.41      0.42       528



In [11]:
exact_match = np.all(y_pred_bin == y_true, axis=1)
exact_match_accuracy = np.mean(exact_match)
print(f"\nExact match accuracy (усі 8 бітів одночасно): {exact_match_accuracy:.4f}")

# 2) Відсоток правильно передбачених бітів загалом
flat_true = y_true.flatten()
flat_pred = y_pred_bin.flatten()
bitwise_accuracy = accuracy_score(flat_true, flat_pred)
print(f"Bitwise accuracy (усі біти загалом): {bitwise_accuracy:.4f}")



Exact match accuracy (усі 8 бітів одночасно): 0.0000
Bitwise accuracy (усі біти загалом): 0.4905
