In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import matthews_corrcoef, classification_report, accuracy_score

# ===============================
# 1. loading data and initial processing
# ===============================
df = pd.read_csv('uniq2_table.csv')

required_cols = [
    'data', 'Time',
    'peak_usa', 'peak_germany', 'peak_australia',
    'peak_italy', 'peak_iceland', 'peak_uk',
    'bin_flag_peak_usa', 'bin_flag_peak_germany', 'bin_flag_peak_australia',
    'bin_flag_peak_italy', 'bin_flag_peak_iceland', 'bin_flag_peak_uk'
]
assert all(col in df.columns for col in required_cols), "Не вистачає необхідних стовпців!"
df.dropna(subset=required_cols, inplace=True)

cases = df['data'].unique()
train_cases, test_cases = train_test_split(cases, test_size=0.2, random_state=42)

# ===============================
# 2. function for creating X and y 
# ===============================
def build_samples_for_case(df_case):
    station_list = [
        'peak_usa', 'peak_germany', 'peak_australia',
        'peak_italy', 'peak_iceland', 'peak_uk'
    ]
    binflag_list = [
        'bin_flag_peak_usa', 'bin_flag_peak_germany', 'bin_flag_peak_australia',
        'bin_flag_peak_italy', 'bin_flag_peak_iceland', 'bin_flag_peak_uk'
    ]
    bins = np.arange(0, 241, 30)  # 0,30,60,...,240
    df_case = df_case.copy()
    df_case['time_bin'] = pd.cut(df_case['Time'], bins=bins, right=False)
    bins_intervals = df_case['time_bin'].cat.categories

    X_case_all = []
    y_case_all = []

    for station, flag_col in zip(station_list, binflag_list):
        bin_slices = []
        bin_labels = []
        valid = True

        for interval in bins_intervals:
            mask = (df_case['time_bin'] == interval)
            arr_signal = df_case.loc[mask, station].values
            arr_flag = df_case.loc[mask, flag_col].values

            count = arr_signal.shape[0]
            if count == 0:
                valid = False
                break

            if count < 1500:
                last_val = arr_signal[-1]
                padding = np.full((1500 - count,), last_val, dtype=arr_signal.dtype)
                arr_signal = np.concatenate([arr_signal, padding])
                flag_value = int(arr_flag[0]) if arr_flag.size > 0 else 0
            else:
                arr_signal = arr_signal[:1500]
                flag_value = int(arr_flag[0])

            bin_slices.append(arr_signal)
            bin_labels.append(flag_value)

        if not valid:
            continue

        X_sample = np.stack(bin_slices, axis=0).T  # (1500, 8)
        y_sample = np.array(bin_labels, dtype=int)  # (8,)

        X_case_all.append(X_sample)
        y_case_all.append(y_sample)

    return X_case_all, y_case_all

# ===============================
# 3. assembling X and y for train/test
# ===============================
X_train_list, y_train_list = [], []
X_test_list,  y_test_list  = [], []

for case in train_cases:
    df_case = df[df['data'] == case].sort_values('Time').reset_index(drop=True)
    Xc, yc = build_samples_for_case(df_case)
    X_train_list.extend(Xc)
    y_train_list.extend(yc)

for case in test_cases:
    df_case = df[df['data'] == case].sort_values('Time').reset_index(drop=True)
    Xc, yc = build_samples_for_case(df_case)
    X_test_list.extend(Xc)
    y_test_list.extend(yc)

if not X_train_list or not X_test_list:
    raise RuntimeError("after building the dataset, nothing is left!")

X_train = np.stack(X_train_list, axis=0)  # (n_train, 1500, 8)
y_train = np.stack(y_train_list, axis=0)  # (n_train, 8)
X_test  = np.stack(X_test_list, axis=0)   # (n_test, 1500, 8)
y_test  = np.stack(y_test_list, axis=0)   # (n_test, 8)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:",  X_test.shape)
print("y_test shape:",  y_test.shape)

# ===============================
# 4. calculate pos_weight for each of the 8 bins
# ===============================
flat_true_train = y_train.flatten()          # vector of length n_train*8
total_pos = np.sum(flat_true_train)          # count of 1
total_neg = flat_true_train.size - total_pos # count of 0

# in case total_pos or total_neg = 0
total_pos = max(total_pos, 1)
total_neg = max(total_neg, 1)

pos_weight_scalar = total_neg / total_pos
pos_weight_tf = tf.constant(pos_weight_scalar, dtype=tf.float32)

print(f"Total positives: {total_pos}, Total negatives: {total_neg}, pos_weight: {pos_weight_scalar:.4f}")

# ===============================
# 5. Custom weighted BCE
# ===============================
def weighted_bce_universal(y_true, y_pred_logits):
   """
Uses a single scalar pos_weight_tf for ALL 8 bits:
- y_true: (batch_size, 8)
- y_pred_logits: (batch_size, 8)
Returns a loss vector with shape=(batch_size,)
"""
    y_true_f   = tf.cast(y_true, tf.float32)
    logits_f   = tf.cast(y_pred_logits, tf.float32)
    # tf.nn.weighted_cross_entropy_with_logits broadcasts the scalar pos_weight_tf to shape (batch, 8)
    loss_per_entry = tf.nn.weighted_cross_entropy_with_logits(
        labels=y_true_f,
        logits=logits_f,
        pos_weight=pos_weight_tf
    )
    # shape(loss_per_entry) = (batch_size, 8)
    return tf.reduce_mean(loss_per_entry, axis=-1)

# ===============================
# 6. build the model (last layer without activation)
# ===============================
model = Sequential([
    Conv1D(128, 5, padding='same', activation='relu', input_shape=(1500, 8)),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.2),

    Conv1D(256, 3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(2),
    Dropout(0.2),

    LSTM(64, return_sequences=False),
    Dropout(0.2),

    Dense(8, activation=None)  # returns logits for 8 bits
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=weighted_bce_universal,
    metrics=['accuracy']
)
model.summary()

# ===============================
# 7. training
# ===============================
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, verbose=1, restore_best_weights=True
)
checkpoint = ModelCheckpoint(
    filepath='experiment_2.h5',
    monitor='val_loss', save_best_only=True, verbose=1
)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.1,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

# ===============================
# 8. evaluation of overall accuracy (all bits together)
# ===============================
# get logits and convert to probabilities
y_pred_logits = model.predict(X_test)             # (n_test, 8)
y_pred_prob   = tf.sigmoid(y_pred_logits).numpy()  # (n_test, 8)
y_pred_bin    = (y_pred_prob >= 0.5).astype(int)   # binary predictions

y_true = y_test.astype(int)

# 1) exact match accuracy: whether all 8 bits are predicted correctly at once
exact_match = np.all(y_pred_bin == y_true, axis=1)
exact_match_accuracy = np.mean(exact_match)
print(f"\nExact match accuracy: {exact_match_accuracy:.4f}")

# 2) bitwise accuracy (all bits separately)
flat_true = y_true.flatten()
flat_pred = y_pred_bin.flatten()
bitwise_accuracy = accuracy_score(flat_true, flat_pred)
print(f"Bitwise accuracy: {bitwise_accuracy:.4f}")

# 3) overall MCC and classification report across all bits


mcc_overall = matthews_corrcoef(flat_true, flat_pred)
print(f"Overall MCC: {mcc_overall:.4f}\n")
print("Classification report:")
print(classification_report(flat_true, flat_pred, zero_division=0))


2025-06-01 12:26:01.694867: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


X_train shape: (258, 1500, 8)
y_train shape: (258, 8)
X_test shape: (66, 1500, 8)
y_test shape: (66, 8)
Total positives: 466, Total negatives: 1598, pos_weight: 3.4292


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step - accuracy: 0.1130 - loss: 1.0804
Epoch 1: val_loss improved from inf to 1.13840, saving model to model_8bins_universal_weight.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 245ms/step - accuracy: 0.1146 - loss: 1.0807 - val_accuracy: 0.2308 - val_loss: 1.1384
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.1392 - loss: 1.0469
Epoch 2: val_loss did not improve from 1.13840
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 0.1383 - loss: 1.0471 - val_accuracy: 0.2308 - val_loss: 1.1398
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step - accuracy: 0.1310 - loss: 1.0432
Epoch 3: val_loss improved from 1.13840 to 1.13826, saving model to model_8bins_universal_weight.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 231ms/step - accuracy: 0.1320 - loss: 1.0428 - val_accuracy: 0.1538 - val_loss: 1.1383
Epoch 4/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.1297 - loss: 1.0434
Epoch 4: val_loss did not improve from 1.13826
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 0.1302 - loss: 1.0427 - val_accuracy: 0.4615 - val_loss: 1.1406
Epoch 5/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.1783 - loss: 0.9989
Epoch 5: val_loss did not improve from 1.13826
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 224ms/step - accuracy: 0.1777 - loss: 1.0001 - val_accuracy: 0.1538 - val_loss: 1.1396
Epoch 6/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.1810 - loss: 0.9967
Epoch 6: val_loss did not improve from 1.13826
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 233ms/step - accuracy: 0.1654 - loss: 0.9613 - val_accuracy: 0.1154 - val_loss: 1.1338
Epoch 11/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.2054 - loss: 0.9229
Epoch 11: val_loss improved from 1.13376 to 1.04470, saving model to model_8bins_universal_weight.h5




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 238ms/step - accuracy: 0.2044 - loss: 0.9246 - val_accuracy: 0.0385 - val_loss: 1.0447
Epoch 12/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.2006 - loss: 0.9258
Epoch 12: val_loss did not improve from 1.04470
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 228ms/step - accuracy: 0.2002 - loss: 0.9269 - val_accuracy: 0.1154 - val_loss: 1.1602
Epoch 13/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step - accuracy: 0.2155 - loss: 0.9516
Epoch 13: val_loss did not improve from 1.04470
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 0.2160 - loss: 0.9510 - val_accuracy: 0.0000e+00 - val_loss: 1.0831
Epoch 14/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step - accuracy: 0.2099 - loss: 1.0022
Epoch 14: val_loss did not improve from 1.04470
[1m15/15[0m [32m━━━━━━━━━━━━━━

In [2]:
# ===============================
# 9. search for the optimal threshold by MCC
# ===============================
# use y_pred_prob and y_true from the previous code

from sklearn.metrics import matthews_corrcoef

# convert y_test probabilities into 1D arrays for MCC calculation
flat_true = y_true.flatten()
best_mcc = -1.0
best_thresh = 0.0

# iterate over thresholds from 0.0 to 1.0 with a step of 0.01
for thresh in np.linspace(0, 1, 101):
    y_pred_bin_thresh = (y_pred_prob >= thresh).astype(int).flatten()
    mcc = matthews_corrcoef(flat_true, y_pred_bin_thresh)
    if mcc > best_mcc:
        best_mcc = mcc
        best_thresh = thresh

print(f"best threshold by MCC: {best_thresh:.2f}, MCC = {best_mcc:.4f}")


y_pred_optimal = (y_pred_prob >= best_thresh).astype(int).flatten()
print("\nClassification report for optimal threshold:")
print(classification_report(flat_true, y_pred_optimal, zero_division=0))


Найкращий поріг за MCC: 0.51, MCC = 0.2959

Classification report for optimal threshold:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       427
           1       0.42      0.46      0.44       101

    accuracy                           0.77       528
   macro avg       0.64      0.65      0.65       528
weighted avg       0.78      0.77      0.78       528

