# Setup

## Install

In [None]:
!pip install awswrangler
!pip install tensorflow

## Import

In [None]:
import pandas as pd
import awswrangler as wr
import boto3
import numpy as np
import matplotlib.pyplot as plt
from getpass import getpass
import os

In [None]:
import tensorflow as tf

In [None]:
# The transformers TensorFlow models are built from Keras 2 objects, while TensorFlow >= 2.16
# bundles Keras 3. Instead of downgrading TensorFlow, opt in to the legacy Keras 2 package
# (tf_keras) through the TF_USE_LEGACY_KERAS flag set on the next line. More info:
# https://stackoverflow.com/questions/79309854/valueerror-exception-encountered-when-calling-layer-tf-bert-model-type-tfber
#
# The flag is set BEFORE transformers / tf_keras are imported so it is already in place
# when those packages load.
# NOTE(review): `tensorflow` itself is imported by an earlier cell; if `tf.keras` must also
# resolve to Keras 2, this assignment should run before that import as well - TODO confirm.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

from transformers import AutoTokenizer, TFBertModel, create_optimizer

# import legacy keras
import tf_keras
from tf_keras.layers import Dense, Input
from tf_keras.models import Model


In [None]:
# Guard: the rest of the notebook relies on the Keras 2 API shipped as tf_keras.
version_str = tf_keras.__version__
print(version_str)
major_version = int(version_str.partition('.')[0])

if major_version < 3:
    print(f"Keras version {version_str} is compatible.")
else:
    raise ValueError(f"Keras version is {version_str}, which is 3.0 or higher. This setup requires Keras version < 3.0.")

## Constants

In [None]:
# --- IMPORTANT: Set these variables before running ---
AWS_REGION = 'us-east-2'
S3_STAGING_DIR = 's3://cs230-market-data-2025/athena-query-results/'
ATHENA_DB = 'cs230_finance_data'
# Querying more data for a small training run.
# ORDER BY RAND() ... LIMIT 10000 asks Athena for a random 10,000-row sample of the rows
# with a non-null label, so the rows returned can differ from one run to the next.
SQL_QUERY = "SELECT concatarticles1, concatarticles2, vol_1_vs_2 FROM paired_vixy_w_titles_v3 WHERE vol_1_vs_2 is not null ORDER BY RAND() LIMIT 10000"

# Name of the regression target column in the query result.
LABEL_COLUMN = 'vol_1_vs_2'
# Columns:
# concatarticles1: concatenated 10 titles (first of pair)
# date1: articles release date (first of pair)
# vol_diff1: percent change of volatility between tomorrow open and yesterday close
# concatarticles2 : ditto (second of pair)
# date2: ditto (second of pair)
# vol_diff2 : ditto (second of pair)
# vol_1_vs_2 : vol_diff2 - vol_diff1, paired label

# --- Model & Tokenizer Configuration ---
# Hugging Face checkpoint name passed to the tokenizer loader.
MODEL_NAME = "ProsusAI/finbert"
# Every text is truncated / padded to this many tokens by the tokenizer.
MAX_LENGTH = 256
# Fraction of the loaded rows held out (from the end) as the validation set.
VAL_RATIO = 0.2
# ----------------------------------------------------


## Connect AWS

In [None]:
# --- AWS Authentication for Colab ---
# Prompt for AWS credentials with getpass so the typed values are not echoed into the
# notebook output. Both values are handed to boto3.Session in the data-loading cell.
aws_access_key_id = getpass('Enter AWS Access Key ID: ')
aws_secret_access_key = getpass('Enter AWS Secret Access Key: ')


## TPU config

In [None]:
# --- TPU Configuration ---
# Pick a tf.distribute strategy in order of preference: TPU, then GPU(s), then CPU.
# NOTE(review): in the visible notebook `strategy` is only printed below; the model is
# not created inside `strategy.scope()`, so the selection has no effect on training.
print("\n--- Step 2: Configuring TPU ---")
try:
    # The resolver raises ValueError when no TPU can be located; that case is handled below.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    print('⚠️ TPU not found. Checking for GPUs.')
    if not tf.config.list_physical_devices('GPU'):
        # Neither TPU nor GPU: keep TensorFlow's default (CPU) strategy.
        print('⚠️ No GPUs found. Running on CPU.')
        strategy = tf.distribute.get_strategy()
    else:
        # MirroredStrategy uses every visible GPU (or the single one available).
        strategy = tf.distribute.MirroredStrategy()
        print(f'✅ Running on {len(tf.config.list_physical_devices("GPU"))} GPU(s).')

print(f"REPLICAS: {strategy.num_replicas_in_sync}")


# Preprocessing

## Read data from aws

In [None]:


print(f"\n--- Step 3: Configuration set for {ATHENA_DB} ---")
print("--- Step 4: Querying Data ---")
print(f"Querying data from {ATHENA_DB}....")

# df stays None when the query fails; later cells test for that.
df = None

try:
    # Session built from the credentials entered in the "Connect AWS" cell.
    session = boto3.Session(
        region_name=AWS_REGION,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

    # Run SQL_QUERY through Athena and load the result into a pandas DataFrame.
    athena_query_args = dict(
        sql=SQL_QUERY,
        database=ATHENA_DB,
        s3_output=S3_STAGING_DIR,
        boto3_session=session,
    )
    df = wr.athena.read_sql_query(**athena_query_args)

    print("\nQuery successful! Data loaded into DataFrame.")

    # Preview the first 5 rows.
    print(df.head())

except Exception as err:
    # Best effort: report the failure and leave df as None.
    print("\nAn error occurred:")
    print(err)

## Tokenizer

In [None]:
# ---------------------------------
# Load Tokenizer
# ---------------------------------
print(f"\n--- Step 5: Loading Tokenizer ({MODEL_NAME}) ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def _encode_titles(texts):
    """Tokenize a list of strings, truncated / padded to MAX_LENGTH, as TensorFlow tensors."""
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='tf'
    )


# ---------------------------------
# Preprocess & Tokenize Data
# ---------------------------------
print(f"\n--- Step 6: Tokenizing Data ---")
if df is None:
    # Stop here with a clear message. Merely printing would let the following cells run
    # and fail later with a NameError on y_train / X_train.
    raise RuntimeError(
        "DataFrame is None. Halting: the data-loading cell did not produce a DataFrame; "
        "fix the query or credentials and re-run it."
    )

# Separate the text columns and the label
text1_list = df['concatarticles1'].astype(str).tolist()
text2_list = df['concatarticles2'].astype(str).tolist()
# Ensure labels are float32 for the loss function
labels = df[LABEL_COLUMN].astype('float32').values
# Optional label standardization, labels = (raw - mean) / (std + 1e-8), is currently disabled.

# Tokenize both text lists
print("Tokenizing concatarticles1...")
encodings1 = _encode_titles(text1_list)

print("Tokenizing concatarticles2...")
encodings2 = _encode_titles(text2_list)

# Create the input dictionary for the Keras model.
# The keys match the names of the Input layers defined in create_regression_model.
# Despite the name, X_train / y_train hold ALL rows; the split cell carves out validation.
X_train = {
    'input_ids_1': encodings1['input_ids'],
    'attention_mask_1': encodings1['attention_mask'],
    'input_ids_2': encodings2['input_ids'],
    'attention_mask_2': encodings2['attention_mask']
}
y_train = labels

print(f"Data prepared: {len(y_train)} pairs.")



## Split train / val sets
TODO: Save the split train / val sets to Google Drive.

In [None]:
# Hold out the trailing VAL_RATIO share of the rows for validation.
split_idx = int(len(y_train) * (1 - VAL_RATIO))

print(f"Splitting validation data from index {split_idx} to {len(y_train)}...")

# Every model input is cut at the same index so features and labels stay aligned:
# rows [split_idx:] form the validation set ...
X_val = {input_name: values[split_idx:] for input_name, values in X_train.items()}
y_val = y_train[split_idx:]

# ... and rows [:split_idx] form the training set without the validation data.
X_train_no_val = {input_name: values[:split_idx] for input_name, values in X_train.items()}
y_train_no_val = y_train[:split_idx]

# Model

## Check keras version

In [None]:
# Same guard as in the setup section, repeated right before the model is built:
# the model code needs the Keras 2 API provided by tf_keras.
version_str = tf_keras.__version__
print(version_str)
major_version = int(version_str.split('.', 1)[0])

keras_is_legacy = major_version < 3
if not keras_is_legacy:
    raise ValueError(f"Keras version is {version_str}, which is 3.0 or higher. This setup requires Keras version < 3.0.")
print(f"Keras version {version_str} is compatible.")

## Hyper params

In [None]:
# Number of top BERT encoder layers left trainable (read by create_regression_model).
num_layers_to_unfreeze = 2

# Set up epochs and steps.
# Kept equal to the number of epochs the training cell runs (3), so that the warmup/decay
# schedule sized from num_train_steps spans the whole run instead of ending after epoch 1.
epochs = 3
batch_size = 16 # Set this so that it fits on the GPU

# Size the learning-rate schedule from the training split only (validation rows excluded).
train_data_size = len(y_train_no_val)
# Keras also runs the final, smaller batch when the data does not divide evenly, so one
# epoch takes ceil(train_data_size / batch_size) optimizer steps (ceiling division below).
steps_per_epoch = -(-train_data_size // batch_size)
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all training steps.
warmup_steps = num_train_steps // 10

## Model Definition

In [None]:
def create_regression_model(max_len=256, model_name=None, num_unfrozen_layers=None):
    """Build the paired-input regression model on top of one shared BERT encoder.

    Both text inputs are passed through the same TFBertModel instance (shared weights);
    the two pooled outputs are concatenated and fed to a single linear unit.

    Args:
        max_len: Token length of each of the four inputs. Must equal the padded length
            produced by the tokenizer.
        model_name: Hugging Face checkpoint to load. Defaults to the notebook-level
            MODEL_NAME, so the model always matches the tokenizer.
        num_unfrozen_layers: How many of the last encoder layers stay trainable.
            Defaults to the notebook-level num_layers_to_unfreeze.

    Returns:
        An uncompiled tf_keras Model with inputs named 'input_ids_1', 'attention_mask_1',
        'input_ids_2', 'attention_mask_2' and a single Dense(1) linear output.
    """
    # Fall back to the notebook-level configuration when no override is given.
    if model_name is None:
        model_name = MODEL_NAME
    if num_unfrozen_layers is None:
        num_unfrozen_layers = num_layers_to_unfreeze

    # Load the model FRESH every time we create it to avoid stale state
    bert_only = TFBertModel.from_pretrained(model_name, from_pt=True)

    # --- Freeze Layers ---
    # Everything starts trainable; only the first encoder layers are then frozen.
    # Embeddings and pooler are not touched by the loop and therefore stay trainable.
    bert_only.trainable = True

    encoder_layers = bert_only.bert.encoder.layer
    num_layers_to_freeze = len(encoder_layers) - num_unfrozen_layers

    for i, layer in enumerate(encoder_layers):
        layer.trainable = i >= num_layers_to_freeze

    # --- Inputs (tf_keras); the names must match the keys of the feature dictionaries ---
    input_ids_1 = Input(shape=(max_len,), dtype=tf.int32, name='input_ids_1')
    attention_mask_1 = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask_1')

    input_ids_2 = Input(shape=(max_len,), dtype=tf.int32, name='input_ids_2')
    attention_mask_2 = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask_2')

    # --- Tower 1 and Tower 2: the same bert_only object is called twice ---
    output_1 = bert_only(input_ids=input_ids_1, attention_mask=attention_mask_1)
    output_2 = bert_only(input_ids=input_ids_2, attention_mask=attention_mask_2)
    embedding_1 = output_1.pooler_output
    embedding_2 = output_2.pooler_output

    concatenated_embeddings = tf.concat([embedding_1, embedding_2], axis=1)

    # Final regression head: one linear unit on the concatenated embeddings.
    output = Dense(1, activation='linear')(concatenated_embeddings)

    # Build model using tf_keras.models.Model
    model = Model(inputs=[input_ids_1, attention_mask_1, input_ids_2, attention_mask_2], outputs=output)

    return model

# Instantiate and check. MAX_LENGTH is the padded length used by the tokenizer, so the
# Input layers always match the shape of the tokenized features.
model = create_regression_model(max_len=MAX_LENGTH)
model.summary()

## Optimizer & Compile

In [None]:
## Creates an optimizer with learning rate schedule, using warmup steps and
## weight decay (AdamWeightDecay)
optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

# Regression target -> mean squared error. The loss is taken from tf_keras (not tf.keras)
# so that it comes from the same Keras 2 package the model was built with, regardless of
# which Keras version `tf.keras` resolves to.
model.compile(
        loss=tf_keras.losses.MeanSquaredError(),
        optimizer=optimizer
    )

## Optional: save init weights of bert

In [None]:
# Snapshot of the first trainable variable of every trainable BERT encoder layer,
# keyed by encoder-layer index, taken before training for a later comparison.
initial_bert_weights_dict = {}

print("\n--- Storing Initial BERT Encoder Layer Weights ---")

# Pick the first layer of the Functional model whose name contains 'bert'
# (expected to be the TFBertModel).
bert_layer_object = next((candidate for candidate in model.layers if 'bert' in candidate.name), None)

if not bert_layer_object:
    print("Error: Could not find a BERT layer in the model.")
else:
    # Structure: TFBertModel -> .bert (MainLayer) -> .encoder -> .layer (list of TFBertLayer)
    for i, encoder_layer in enumerate(bert_layer_object.bert.encoder.layer):
        layer_variables = encoder_layer.trainable_variables
        if len(layer_variables) == 0:
            # Frozen layers expose no trainable variables and are skipped.
            print(f"BERT Layer {i} has no trainable variables.")
            continue

        # Keep an independent numpy copy so later training cannot alter the snapshot.
        first_variable = layer_variables[0]
        initial_bert_weights_dict[i] = first_variable.numpy().copy()
        print(f"Stored initial weights for BERT Layer {i}: {first_variable.name}")
        print(f"BERT layer samples (first 5 values): {initial_bert_weights_dict[i].flatten()[:5]}")

# Summary of how many encoder layers were captured.
print(f"\nInitial weights for {len(initial_bert_weights_dict)} BERT encoder layers stored in 'initial_bert_weights_dict'.")

## Optional: Re-init, in case of re-train

In [None]:
# Re-initialize model to be sure. MAX_LENGTH keeps the Input layers in step with the
# padded length produced by the tokenizer.
model = create_regression_model(max_len=MAX_LENGTH)

# Use the tf_keras optimizer and loss to match the tf_keras model.
# Note: this replaces the warmup/decay optimizer created in the previous section with
# plain Adam at a constant learning rate.
optimizer = tf_keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
        loss=tf_keras.losses.MeanSquaredError(),
        optimizer=optimizer
    )



## Create Checkpoints

In [None]:
# Create checkpoints
# BASE_DRIVE_PATH is not assigned by any earlier cell of this notebook. Reuse it when it
# already exists in the session; otherwise fall back to the current working directory so
# that "Restart & Run All" does not stop with a NameError.
# NOTE(review): set BASE_DRIVE_PATH to a mounted Google Drive folder if checkpoints should
# outlive the Colab runtime - TODO confirm the intended location.
BASE_DRIVE_PATH = globals().get('BASE_DRIVE_PATH') or os.getcwd()

# Checkpoint directory
CHECKPOINT_DIR = os.path.join(BASE_DRIVE_PATH, 'training_checkpoints')
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# {epoch:04d} is filled in by the callback, giving one file prefix per saved epoch.
checkpoint_path = os.path.join(CHECKPOINT_DIR, 'cp-{epoch:04d}.ckpt')

# Checkpoint callback passed to model.fit
cp_callback = tf_keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,  # Set to False to save the entire model (architecture + weights + optimizer state)
        verbose=1,               # 0 = silent, 1 = print a message whenever the callback saves
        save_best_only=True,     # Save only the best model based on monitor
        monitor='val_loss',      # Metric to monitor (e.g., 'val_loss', 'val_accuracy')
        mode='min',              # 'min' for metrics like loss, 'max' for metrics like accuracy
        save_freq='epoch'        # 'epoch' to save after each epoch, or an integer for number of batches
    )

## Train the model
TODO: Optionally load the weights from a checkpoint before training. Use this when Colab has disconnected you from the VM and a checkpoint was saved.

In [None]:
# Train on the training split and validate on the held-out rows after every epoch.
# `epochs` is the shared hyper-parameter from the "Hyper params" cell: using it here
# (instead of a separate literal) keeps the training length and the step counts derived
# from it (num_train_steps / warmup_steps) in agreement.
history = model.fit(
        X_train_no_val,
        y_train_no_val,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[cp_callback]  # writes a checkpoint whenever val_loss improves
    )



##  Optional: compare bert weights

In [None]:
import numpy as np
from tensorflow.keras import backend as K

print("\n--- Analyzing BERT Encoder Layer Weights ---\n")

# Pick the first layer of the Functional model whose name contains 'bert'
# (expected to be the TFBertModel).
bert_layer_object = next((candidate for candidate in model.layers if 'bert' in candidate.name), None)

if not bert_layer_object:
    print("Error: Could not find a BERT layer in the model.")
else:
    # Compare the stored initial snapshot with the current value, one encoder layer at a time.
    for i, bert_layer_trained in enumerate(bert_layer_object.bert.encoder.layer):
        if i not in initial_bert_weights_dict:
            print(f"Layer {i}: No initial weights stored or no trainable variables to compare.\n")
            continue

        current_variables = bert_layer_trained.trainable_variables
        if len(current_variables) == 0:
            print(f"Layer {i}: Had initial weights, but no trainable variables after model build/freeze check. (Likely frozen)\n")
            continue

        # First trainable variable of the layer (usually the query kernel).
        first_variable = current_variables[0]
        initial_weights_for_layer = initial_bert_weights_dict[i]
        trained_weights_for_layer = first_variable.numpy().copy()

        # Element-wise comparison only makes sense for identical shapes.
        if initial_weights_for_layer.shape != trained_weights_for_layer.shape:
            print(f"Error: Shape mismatch for Layer {i} during comparison.\n")
            continue

        print(f"Init BERT layer samples (first 5 values): {initial_weights_for_layer.flatten()[:5]}")
        print(f"Trained BERT layer samples (first 5 values): {trained_weights_for_layer.flatten()[:5]}")
        non_identical_weights_count = np.sum(initial_weights_for_layer != trained_weights_for_layer)
        total_weights_in_variable = initial_weights_for_layer.size

        variable_name = first_variable.name

        print(f"Layer {i} (Variable: {variable_name}):")
        print(f"  Total weights in this variable: {total_weights_in_variable}")
        print(f"  Non-identical weights after training: {non_identical_weights_count}")
        if total_weights_in_variable > 0:
            changed_pct = (non_identical_weights_count / total_weights_in_variable) * 100
            print(f"  Percentage of non-identical weights: {changed_pct:.2f}%\n")
        else:
            print("  (No weights to compare in this variable)\n")

    print("\n--- Overall Trainable Parameters for BERT Encoder ---")
    # NOTE(review): this sums every trainable variable of the whole BERT layer, not only
    # those of the encoder sub-layers, although the printed label says "BERT Encoder".
    overall_bert_trainable_params = np.sum([np.prod(v.shape) for v in bert_layer_object.trainable_variables])
    print(f"Total trainable parameters in BERT Encoder: {overall_bert_trainable_params}")

## Reload checkpoint

In [None]:
# --- Reload and Verify Checkpoint ---
print("\n--- Reloading best checkpoint to verify ---")
# Most recent checkpoint recorded in CHECKPOINT_DIR (None when the directory has none).
latest = tf.train.latest_checkpoint(CHECKPOINT_DIR)

if not latest:
    print("No checkpoint found to reload.")
else:
    print(f"Found checkpoint: {latest}")
    # Overwrite the weights of the in-memory model with the checkpointed ones.
    model.load_weights(latest)
    print("Weights loaded. Evaluating on validation set...")

    # The loss printed here can be compared with the best val_loss logged during training.
    val_loss = model.evaluate(X_val, y_val, batch_size=batch_size, verbose=1)
    print(f"Validation Loss from loaded checkpoint: {val_loss}")

# Evaluation: Outdated

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
# import numpy as np
# import matplotlib.pyplot as plt


In [None]:

from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt

if df is not None:

    # ---------------------------------
    # 2. Generate Regression Predictions
    # ---------------------------------
    print("Running predictions on validation set...")
    # `model` is the network trained above (this notebook defines no `regression_model`).
    # It outputs one continuous value per pair.
    val_raw_predictions = model.predict(X_val, batch_size=128, verbose=1)
    val_raw_predictions = val_raw_predictions.flatten()

    # ---------------------------------
    # 3. Apply a fixed threshold for Binary Classification
    # ---------------------------------
    # As requested, skipping optimal threshold search and using a fixed threshold.
    # NOTE(review): 0.5 is carried over from a binary-label setup; for the continuous
    # vol_1_vs_2 label a different cut-off (e.g. 0.0) may be intended - TODO confirm.
    best_threshold = 0.5

    print(f"\nUsing a fixed classification threshold: {best_threshold:.2f}")

    # ---------------------------------
    # 4. Final Evaluation Metrics
    # ---------------------------------
    # Generate final class predictions using the fixed threshold
    val_predictions = (val_raw_predictions > best_threshold).astype(np.float32)
    # The labels are continuous, which the classification metrics reject, so they are
    # binarized with the same threshold. For labels that are already 0/1 this is a no-op.
    val_true_classes = (np.asarray(y_val) > best_threshold).astype(np.float32)

    final_accuracy = accuracy_score(val_true_classes, val_predictions)
    final_f1 = f1_score(val_true_classes, val_predictions, zero_division=0)

    print("\n--- Final Validation Performance Report ---")
    print(f"Threshold used: {best_threshold:.2f}")
    print(f"Final Accuracy: {final_accuracy:.2f}")
    print(f"Final F1 Score: {final_f1:.4f}")
    print("\nConfusion Matrix:")
    # Format: [[TN, FP], [FN, TP]]; explicit labels keep the 2x2 shape even when only
    # one class occurs in the data.
    print(confusion_matrix(val_true_classes, val_predictions, labels=[0, 1]))

    print("\nClassification Report:")
    print(classification_report(
        val_true_classes,
        val_predictions,
        labels=[0, 1],
        target_names=['Dissimilar (0)', 'Similar (1)'],
        zero_division=0,
    ))

else:
    print("Cannot evaluate: DataFrame was not loaded.")

# Load & Re-evaluate

In [None]:
# --- Loading the saved model with weight transfer ---
print("\n--- Loading the saved model via weight transfer ---")

# Stays None when loading fails, so the evaluation below is skipped instead of raising.
loaded_regression_model = None

try:
    # 1. Locate the most recent checkpoint written by the ModelCheckpoint callback.
    #    (Checkpoints in CHECKPOINT_DIR are the only weights this notebook saves.)
    weights_path = tf.train.latest_checkpoint(CHECKPOINT_DIR)
    if weights_path is None:
        raise FileNotFoundError(f"No checkpoint found in {CHECKPOINT_DIR}")

    # 2. Instantiate a FRESH, un-trained model with the same architecture as in training.
    fresh_regression_model = create_regression_model(max_len=MAX_LENGTH)
    print("Fresh model structure built.")

    # 3. Load the fine-tuned weights into the fresh structure.
    fresh_regression_model.load_weights(weights_path)
    loaded_regression_model = fresh_regression_model # Use this for predictions
    print(f"Fine-tuned weights loaded successfully from: {weights_path}")

except Exception as e:
    # Report the problem; loaded_regression_model remains None.
    print(f"Error during weight loading: {e}")

# --- Re-evaluating with the loaded model ---
print("\n--- Re-evaluating with the loaded model ---")

if df is None:
    print("Cannot evaluate: DataFrame was not loaded.")
elif loaded_regression_model is None:
    print("Cannot evaluate: model weights were not loaded.")
else:
    # Generate Regression Predictions using the loaded model
    print("Running predictions on validation set with loaded model...")
    loaded_val_raw_predictions = loaded_regression_model.predict(X_val, batch_size=128, verbose=1)
    loaded_val_raw_predictions = loaded_val_raw_predictions.flatten()

    # Reuse the threshold from the evaluation cell; fall back to its value (0.5) when
    # that cell has not been run in this session.
    best_threshold = globals().get('best_threshold', 0.5)
    print(f"\nUsing a fixed classification threshold: {best_threshold:.2f}")

    # Generate final class predictions using the fixed threshold
    loaded_val_predictions = (loaded_val_raw_predictions > best_threshold).astype(np.float32)
    # The labels are continuous, which the classification metrics reject, so they are
    # binarized with the same threshold. For labels that are already 0/1 this is a no-op.
    loaded_val_true_classes = (np.asarray(y_val) > best_threshold).astype(np.float32)

    # Final Evaluation Metrics
    loaded_final_accuracy = accuracy_score(loaded_val_true_classes, loaded_val_predictions)
    loaded_final_f1 = f1_score(loaded_val_true_classes, loaded_val_predictions, zero_division=0)

    print("\n--- Evaluation Report for Loaded Model ---")
    print(f"Threshold used: {best_threshold:.2f}")
    print(f"Final Accuracy: {loaded_final_accuracy:.2f}")
    print(f"Final F1 Score: {loaded_final_f1:.4f}")
    print("\nConfusion Matrix:")
    # Explicit labels keep the 2x2 shape even when only one class occurs in the data.
    print(confusion_matrix(loaded_val_true_classes, loaded_val_predictions, labels=[0, 1]))

    print("\nClassification Report:")
    print(classification_report(
        loaded_val_true_classes,
        loaded_val_predictions,
        labels=[0, 1],
        target_names=['Dissimilar (0)', 'Similar (1)'],
        zero_division=0,
    ))