In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import matthews_corrcoef, classification_report, accuracy_score

# ===============================
# 1. loading data and initial processing
# ===============================
df = pd.read_csv('uniq2_table.csv')

required_cols = [
    'data', 'Time',
    'peak_usa', 'peak_germany', 'peak_australia',
    'peak_italy', 'peak_iceland', 'peak_uk',
    'bin_flag_peak_usa', 'bin_flag_peak_germany', 'bin_flag_peak_australia',
    'bin_flag_peak_italy', 'bin_flag_peak_iceland', 'bin_flag_peak_uk'
]
assert all(col in df.columns for col in required_cols), "Не вистачає необхідних стовпців!"
df.dropna(subset=required_cols, inplace=True)

cases = df['data'].unique()
train_cases, test_cases = train_test_split(cases, test_size=0.2, random_state=42)

# ===============================
# 2. function for creating X and y 
# ===============================
def build_samples_for_case(df_case):
    station_list = [
        'peak_usa', 'peak_germany', 'peak_australia',
        'peak_italy', 'peak_iceland', 'peak_uk'
    ]
    binflag_list = [
        'bin_flag_peak_usa', 'bin_flag_peak_germany', 'bin_flag_peak_australia',
        'bin_flag_peak_italy', 'bin_flag_peak_iceland', 'bin_flag_peak_uk'
    ]
    bins = np.arange(0, 241, 30)  # 0,30,60,...,240
    df_case = df_case.copy()
    df_case['time_bin'] = pd.cut(df_case['Time'], bins=bins, right=False)
    bins_intervals = df_case['time_bin'].cat.categories

    X_case_all = []
    y_case_all = []

    for station, flag_col in zip(station_list, binflag_list):
        bin_slices = []
        bin_labels = []
        valid = True

        for interval in bins_intervals:
            mask = (df_case['time_bin'] == interval)
            arr_signal = df_case.loc[mask, station].values
            arr_flag = df_case.loc[mask, flag_col].values

            count = arr_signal.shape[0]
            if count == 0:
                valid = False
                break

            if count < 1500:
                last_val = arr_signal[-1]
                padding = np.full((1500 - count,), last_val, dtype=arr_signal.dtype)
                arr_signal = np.concatenate([arr_signal, padding])
                flag_value = int(arr_flag[0]) if arr_flag.size > 0 else 0
            else:
                arr_signal = arr_signal[:1500]
                flag_value = int(arr_flag[0])

            bin_slices.append(arr_signal)
            bin_labels.append(flag_value)

        if not valid:
            continue

        X_sample = np.stack(bin_slices, axis=0).T  # (1500, 8)
        y_sample = np.array(bin_labels, dtype=int)  # (8,)

        X_case_all.append(X_sample)
        y_case_all.append(y_sample)

    return X_case_all, y_case_all

# ===============================
# 3. assembling X and y for train/test
# ===============================
X_train_list, y_train_list = [], []
X_test_list,  y_test_list  = [], []

for case in train_cases:
    df_case = df[df['data'] == case].sort_values('Time').reset_index(drop=True)
    Xc, yc = build_samples_for_case(df_case)
    X_train_list.extend(Xc)
    y_train_list.extend(yc)

for case in test_cases:
    df_case = df[df['data'] == case].sort_values('Time').reset_index(drop=True)
    Xc, yc = build_samples_for_case(df_case)
    X_test_list.extend(Xc)
    y_test_list.extend(yc)

if not X_train_list or not X_test_list:
    raise RuntimeError("after building the dataset, nothing is left!")

X_train = np.stack(X_train_list, axis=0)  # (n_train, 1500, 8)
y_train = np.stack(y_train_list, axis=0)  # (n_train, 8)
X_test  = np.stack(X_test_list, axis=0)   # (n_test, 1500, 8)
y_test  = np.stack(y_test_list, axis=0)   # (n_test, 8)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:",  X_test.shape)
print("y_test shape:",  y_test.shape)

# ===============================
#  4. calculate pos_weight for each of the 8 bins
# ===============================
#Simply specify the desired class weight ratio for 0 and 1:
#for example, if you want the positive class to have a weight of 5.0 and the negative class to have 1.0, set:
WEIGHT_POS = 3.0 # weight for cases where y_true == 1
WEIGHT_NEG = 1.0 # weight for cases where y_true == 0

#pos_weight_tf will be a scalar (weight for logits of "1")
#In tf.nn.weighted_cross_entropy_with_logits, pos_weight defines
#how much more the model penalizes errors when y_true=1 (false negatives).
pos_weight_tf = tf.constant(WEIGHT_POS, dtype=tf.float32)


# ===============================
# 5. Custom weighted BCE
# ===============================
def weighted_bce_manual(y_true, y_pred_logits):
    """
Use WEIGHT_POS as the weight for cases where y_true=1.
y_true: (batch_size, 8)
y_pred_logits: (batch_size, 8)
"""
    y_true_f = tf.cast(y_true, tf.float32)
    logits_f = tf.cast(y_pred_logits, tf.float32)

    # tf.nn.weighted_cross_entropy_with_logits takes pos_weight (scalar)
    # and automatically treats the weight for y_true=0 as 1.
    loss_per_entry = tf.nn.weighted_cross_entropy_with_logits(
        labels=y_true_f,
        logits=logits_f,
        pos_weight=pos_weight_tf
    )
    #loss_per_entry shape = (batch_size, 8)
    #averaging over 8 bits → we get the loss per batch row:
    return tf.reduce_mean(loss_per_entry, axis=-1)

# ===============================
# 6. Building the model (last layer without activation)
# ===============================
model = Sequential([
    Conv1D(128, 5, padding='same', activation='relu', input_shape=(1500, 8)),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.2),

    Conv1D(256, 3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.2),

    LSTM(64, return_sequences=False),
    Dropout(0.2),

    Dense(8, activation=None) # returns logits for 8 bits
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=weighted_bce_manual,
    metrics=['accuracy']
)
model.summary()

# ===============================
# 7. Training
# ===============================
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, verbose=1, restore_best_weights=True
)
checkpoint = ModelCheckpoint(
    filepath='experiment_3.h5',
    monitor='val_loss', save_best_only=True, verbose=1
)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.1,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

# ===============================
# 8. evaluation of overall accuracy (all bits together)
# ===============================
y_pred_logits = model.predict(X_test)             # (n_test, 8)
y_pred_prob   = tf.sigmoid(y_pred_logits).numpy()  # (n_test, 8)
y_pred_bin    = (y_pred_prob >= 0.5).astype(int)   # binary predictions

y_true = y_test.astype(int)

# 1) exact match accuracy: whether all 8 bits are predicted correctly at once
exact_match = np.all(y_pred_bin == y_true, axis=1)
exact_match_accuracy = np.mean(exact_match)
print(f"\nExact match accuracy : {exact_match_accuracy:.4f}")

# 2)  bitwise accuracy (all bits separately)
flat_true = y_true.flatten()
flat_pred = y_pred_bin.flatten()
bitwise_accuracy = accuracy_score(flat_true, flat_pred)
print(f"Bitwise accuracy : {bitwise_accuracy:.4f}")

# 3) overall MCC and classification report across all bits
mcc_overall = matthews_corrcoef(flat_true, flat_pred)
print(f"Overall MCC : {mcc_overall:.4f}\n")
print("Classification report :")
print(classification_report(flat_true, flat_pred, zero_division=0))


X_train shape: (258, 1500, 8)
y_train shape: (258, 8)
X_test shape: (66, 1500, 8)
y_test shape: (66, 8)
Встановлені ваги: для 1 → 3.0, для 0 → 1.0 (негативна вага фактично =1.0)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.1201 - loss: 1.0258
Epoch 1: val_loss improved from inf to 1.05805, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 250ms/step - accuracy: 0.1212 - loss: 1.0241 - val_accuracy: 0.1923 - val_loss: 1.0581
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.1268 - loss: 0.9513
Epoch 2: val_loss improved from 1.05805 to 1.05699, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 232ms/step - accuracy: 0.1278 - loss: 0.9532 - val_accuracy: 0.1923 - val_loss: 1.0570
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step - accuracy: 0.1327 - loss: 0.9705
Epoch 3: val_loss improved from 1.05699 to 1.05668, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 235ms/step - accuracy: 0.1325 - loss: 0.9708 - val_accuracy: 0.1923 - val_loss: 1.0567
Epoch 4/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step - accuracy: 0.0985 - loss: 0.9950
Epoch 4: val_loss improved from 1.05668 to 1.05610, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 233ms/step - accuracy: 0.0988 - loss: 0.9929 - val_accuracy: 0.1923 - val_loss: 1.0561
Epoch 5/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.1034 - loss: 0.9439
Epoch 5: val_loss did not improve from 1.05610
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 225ms/step - accuracy: 0.1064 - loss: 0.9442 - val_accuracy: 0.1923 - val_loss: 1.0581
Epoch 6/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - accuracy: 0.1863 - loss: 0.9180
Epoch 6: val_loss did not improve from 1.05610
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 0.1862 - loss: 0.9191 - val_accuracy: 0.1923 - val_loss: 1.0714
Epoch 7/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.2350 - loss: 0.9679
Epoch 7: val_loss did not improve from 1.05610
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 230ms/step - accuracy: 0.1954 - loss: 0.9608 - val_accuracy: 0.1538 - val_loss: 1.0246
Epoch 15/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step - accuracy: 0.1851 - loss: 0.9076
Epoch 15: val_loss improved from 1.02460 to 0.94847, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 0.1849 - loss: 0.9081 - val_accuracy: 0.4231 - val_loss: 0.9485
Epoch 16/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.1328 - loss: 0.8833
Epoch 16: val_loss improved from 0.94847 to 0.92980, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 233ms/step - accuracy: 0.1317 - loss: 0.8839 - val_accuracy: 0.0000e+00 - val_loss: 0.9298
Epoch 17/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step - accuracy: 0.1799 - loss: 0.9076
Epoch 17: val_loss did not improve from 0.92980
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 222ms/step - accuracy: 0.1781 - loss: 0.9074 - val_accuracy: 0.4231 - val_loss: 1.0750
Epoch 18/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - accuracy: 0.2057 - loss: 0.8400
Epoch 18: val_loss did not improve from 0.92980
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 226ms/step - accuracy: 0.2031 - loss: 0.8423 - val_accuracy: 0.4231 - val_loss: 1.0677
Epoch 19/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.1428 - loss: 0.8650
Epoch 19: val_loss did not improve from 0.92980
[1m15/15[0m [32m━━━━━━━━━━━━━━



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 230ms/step - accuracy: 0.1485 - loss: 0.7922 - val_accuracy: 0.1538 - val_loss: 0.8910
Epoch 24/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - accuracy: 0.1290 - loss: 0.7820
Epoch 24: val_loss did not improve from 0.89101
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 225ms/step - accuracy: 0.1298 - loss: 0.7846 - val_accuracy: 0.1538 - val_loss: 0.8965
Epoch 25/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - accuracy: 0.2129 - loss: 0.8442
Epoch 25: val_loss improved from 0.89101 to 0.86919, saving model to model_8bins_manual_weight_proverka.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 231ms/step - accuracy: 0.2109 - loss: 0.8446 - val_accuracy: 0.0385 - val_loss: 0.8692
Epoch 26/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.1782 - loss: 0.8112
Epoch 26: val_loss did not improve from 0.86919
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 228ms/step - accuracy: 0.1775 - loss: 0.8108 - val_accuracy: 0.1538 - val_loss: 1.1005
Epoch 27/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.1767 - loss: 0.7992
Epoch 27: val_loss did not improve from 0.86919
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 225ms/step - accuracy: 0.1762 - loss: 0.7986 - val_accuracy: 0.1538 - val_loss: 1.3642
Epoch 28/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.2378 - loss: 0.7572
Epoch 28: val_loss did not improve from 0.86919
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━

In [4]:
# ===============================
# 9. search for the optimal threshold by MCC
# ===============================
# use y_pred_prob and y_true from the previous code
from sklearn.metrics import matthews_corrcoef

# convert y_test probabilities into 1D arrays for MCC calculation
flat_true = y_true.flatten()
best_mcc = -1.0
best_thresh = 0.0

# iterate over thresholds from 0.0 to 1.0 with a step of 0.01
for thresh in np.linspace(0, 1, 101):
    y_pred_bin_thresh = (y_pred_prob >= thresh).astype(int).flatten()
    mcc = matthews_corrcoef(flat_true, y_pred_bin_thresh)
    if mcc > best_mcc:
        best_mcc = mcc
        best_thresh = thresh

print(f"best threshold by MCC: {best_thresh:.2f}, MCC = {best_mcc:.4f}")


y_pred_optimal = (y_pred_prob >= best_thresh).astype(int).flatten()
print("\nClassification report for optimal threshold:")
print(classification_report(flat_true, y_pred_optimal, zero_division=0))


Найкращий поріг за MCC: 0.35, MCC = 0.3592

Classification report for optimal threshold:
              precision    recall  f1-score   support

           0       0.99      0.45      0.62       427
           1       0.30      0.99      0.46       101

    accuracy                           0.55       528
   macro avg       0.65      0.72      0.54       528
weighted avg       0.86      0.55      0.59       528

