In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import custom_object_scope
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras import layers, models, losses, callbacks

from generative_text.general_tnn_generative.process import main
from generative_text.utils.fnProcessing import read_config

# Read the 'params' and 'clearml' sections of the Keras config file.
config_params = read_config(section='params', config_path='./generative_text/configkeras.ini')
config_clearml = read_config(section='clearml', config_path='./generative_text/configkeras.ini')

# NOTE(review): the string below is a bare expression, not a comment. It probably
# means "text column" (the call below passes input_col='text') -- confirm.
'''
Load dataframe with test column
'''

# NOTE(review): `training_data` is not assigned anywhere in the visible cells; it
# must be loaded before this line or the cell raises NameError on a fresh kernel.
# `main` returns four values, unpacked here as the three dataset splits and the vocabulary.
train_ds, val_ds, test_ds, combined_vocab = main(training_data, input_col='text', clean_col='text')

In [None]:
from generative_text.utils import fnContextPairing

# Entity labels handed to ContextPairing.
# NOTE(review): presumably spaCy-style NER labels used as an inclusion filter -- confirm.
included_entity_labels = ['PERSON', 'PRODUCT', 'ORG', 'FAC', 'NORP']
# Replace with your path to the dump file. Download here: https://dumps.wikimedia.org/other/wikibase/wikidatawiki/
path_to_dump_file = './drive/MyDrive/research/chanscope/data/simplewiki-20211001-pages-articles-multistream.xml'

# Only the module `fnContextPairing` is imported in this cell, so the class has to
# be reached through it; the bare name `ContextPairing` raised NameError.
context_pairing = fnContextPairing.ContextPairing(path_to_dump_file, included_entity_labels)
# NOTE(review): `data` is not assigned in any visible cell -- it must be defined
# before this cell is run.
context_pairing.run(data)

### i. General Transformer Generative

#### 1. Local

In [None]:
from generative_text.general_generative_keras.tnn import TransformerBlock, TokenAndPositionEmbedding
from generative_text.general_generative_keras.train import train_model, TrainTextGenerator, CustomSchedule
from generative_text.general_generative_keras.evaluate import TextGenerator, CustomSchedule

# `configparser` and `os` are used in this cell but were only imported by later
# cells, so a fresh-kernel "Run All" failed here with NameError.
import configparser
import os

# Number of training epochs comes from the [params] section of the Keras config.
config = configparser.ConfigParser()
config.read('./generative_text/configkeras.ini')
params = config["params"]
epochs = int(params['epochs'])

LOAD_MODEL = False
MODEL_PATH = './models/general_generative/model_1.h5'

# Preload only when it was requested AND the saved model file is actually present;
# otherwise train_model is asked for a model without preloading.
model = train_model(
    preload_model=LOAD_MODEL and os.path.exists(MODEL_PATH),
    model_path=MODEL_PATH,
)

def get_callbacks():
    """Build the callback list used for the local training run.

    Returns:
        list: ``[ModelCheckpoint, TrainTextGenerator, EarlyStopping]`` in that
        order. The checkpoint and early-stopping callbacks both monitor
        ``val_loss``; the text generator is built from the module-level
        ``combined_vocab``.
    """
    checkpoint_options = dict(
        filepath='./models/general_generative/weights.{epoch:02d}-{val_loss:.2f}.ckpt',
        save_weights_only=False,
        save_best_only=True,
        monitor='val_loss',
        verbose=1,
    )
    early_stopping_options = dict(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )
    # Callbacks are constructed in the same order in which they are returned.
    return [
        ModelCheckpoint(**checkpoint_options),
        TrainTextGenerator(index_to_word=combined_vocab),
        EarlyStopping(**early_stopping_options),
    ]

# Train on the training split, validating against the validation split after
# each epoch, with the callbacks assembled by get_callbacks().
model.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=get_callbacks())
# Write the trained model to MODEL_PATH.
model.save(MODEL_PATH)

#### 2. ClearML

In [None]:
from clearml import Task, Dataset as ClearMLDataset
from generative_text.general_tnn_generative.process import main
from generative_text.general_tnn_generative.utils.fnProcessing import read_config
from generative_text.general_tnn_generative.utils.fnOps import ClearMLOps, ClearMLOpsTraining
from datetime import datetime
import os
import pandas as pd

# Load the two config sections needed for ClearML-driven training.
config_params = read_config(config_path='./generative_text/configkeras.ini', section='params')
clearml_params = read_config(config_path='./generative_text/configkeras.ini', section='clearml')

# Pull the individual ClearML settings out of the section in one pass.
(
    clearml_project_name,
    clearml_model_id,
    clearml_output_uri,
    model_name,
    model_path,
) = (
    clearml_params[setting]
    for setting in (
        'clearml_project_name',
        'clearml_model_id',
        'clearml_output_uri',
        'model_name',
        'model_path',
    )
)

# Helper objects: one for running training tasks, one for general ClearML operations.
clean_training = ClearMLOpsTraining(clearml_params, config_params)
clean_ops = ClearMLOps(config_path='./generative_text/configkeras.ini')

# Connect with the configured credentials, then list the project's datasets
# (the last expression is what the cell displays).
clean_ops.set_creds_connect()
clean_ops.list_datasets(clearml_project_name)

In [None]:
# Name under which the ClearML training task is run.
task_name = 'Training Test 3'
# Run training against the dataset with the given ClearML id; no in-memory
# training data or vocabulary is supplied and no existing model is loaded.
model = clean_training.run_clearml_training_task(
    task_name,
    dataset_id='df7aa04e1581485188eab3ee4df11ea1',
    training_data=None,
    vocab=None,
    load_model=False,
)

##### Evaluation

In [None]:
from generative_text.general_generative_keras.tnn import TransformerBlock, TokenAndPositionEmbedding
# The package root is `generative_text`, as in every other import in this
# notebook; the previous `generative.` prefix was a typo.
from generative_text.general_generative_keras.evaluate import TextGenerator, CustomSchedule

MODEL_PATH = './models/general_generative/model_1.h5'
# The custom layers and the learning-rate schedule are registered for the
# duration of load_model so the saved model can reference them by name.
with custom_object_scope({'CustomSchedule': CustomSchedule, 'TransformerBlock': TransformerBlock, 'TokenAndPositionEmbedding': TokenAndPositionEmbedding}):
    model = load_model(MODEL_PATH)

# Generate a continuation of a fixed prompt using top-k sampling (k=15).
# `combined_vocab` comes from the dataset-building cell at the top of the notebook.
test_text_gen = TextGenerator(model=model, index_to_word=combined_vocab, top_k=15, generation_type='general', sampling_type='top_k')
info = test_text_gen.generate("Today in the news", max_tokens=50, temperature=1.0)

### ii. Custom GPT Generative

Incrementally "fold" more replies data into the training set, as training performance and system constraints allow.

In [None]:
def update_replies(sample_size=150000, raw_data_path='/content/drive/MyDrive/research/chanscope/data/replies_raw_2.csv', replies_path='/content/drive/MyDrive/research/chanscope/data/replies/replies.csv'):
    """Move a random sample of raw rows into the accumulated replies file.

    A sample of the raw CSV is prepared and turned into reply dialogs, the
    dialogs are appended (de-duplicated) to the replies CSV, and the sampled
    rows are removed from the raw CSV. Both files are overwritten in place.

    Relies on ``prepare_data``, ``find_dialogs`` and ``augment_dialogs`` being
    in scope when the function is called.

    Args:
        sample_size: Maximum number of raw rows to draw. Clamped to the number
            of rows actually present, so the function keeps working as the raw
            file shrinks across calls.
        raw_data_path: CSV of not-yet-processed rows; rewritten without the
            sampled rows.
        replies_path: CSV of accumulated replies; rewritten with the new
            replies appended.

    Returns:
        tuple: ``(complete_replies, remaining_data)`` -- the updated replies
        frame and the raw rows that were not sampled.
    """
    # Read the raw data once; the same frame is reused below to compute the
    # leftover rows, so sampled and leftover indices are guaranteed to line up.
    all_raw_data = pd.read_csv(raw_data_path)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so never ask for more than is available.
    raw_data = all_raw_data.sample(min(sample_size, len(all_raw_data)))

    # Prepare the data
    prepared_data = prepare_data(raw_data)
    thread_headers = prepared_data.dropna(subset=['text_clean','posted_comment'])[['thread_id', 'thread_header', 'posted_comment', 'posted_date_time']]

    # Find and augment dialogs
    new_replies = find_dialogs(thread_headers)
    new_replies = augment_dialogs(new_replies, prepared_data)
    new_replies = new_replies.dropna()

    # Read the existing replies and append new ones
    complete_replies = pd.read_csv(replies_path)
    complete_replies = pd.concat([complete_replies, new_replies]).drop_duplicates().reset_index(drop=True)

    # Save the updated replies
    complete_replies.to_csv(replies_path, index=False)

    # Remove the sampled data from the original dataset and save it
    remaining_data = all_raw_data.loc[~all_raw_data.index.isin(raw_data.index)]
    remaining_data.to_csv(raw_data_path, index=False)
    return complete_replies, remaining_data

# update_replies returns (complete_replies, remaining_data); unpack in that
# order so each name holds the frame it describes.
complete_replies, remaining_data = update_replies()

In [None]:
import pandas as pd
from utils.fnProcessing import find_dialogs, augment_dialogs,view_shapes
import os
import tensorflow as tf
from generative_text.general_gpt_generative.preprocessing import DirectoryManager  
from generative_text.general_gpt_generative.preprocessing import initialize_and_prepare  
from generative_text.general_gpt_generative.processing import process_data
from generative_text.general_gpt_generative.evaluate import plot_text_pair_distribution, count_tokens_and_lengths,plot_history
import pandas as pd
import configparser

# Load the custom-GPT configuration and snapshot its two sections as plain dicts.
config = configparser.ConfigParser()
config.read('./generative_text/configcustom.ini')
config_params, config_paths = config['params'], config['paths']
paths = dict(config_paths)
base_directory = paths['metadata_path']
params = dict(config_params)

# Integer hyperparameters, converted from their string form in the config.
(
    max_len,
    vocab_size,
    embedding_dim,
    num_heads,
    num_layers,
    key_dim,
    ff_dim,
) = (
    int(params[option])
    for option in (
        'max_len',
        'vocab_size',
        'embedding_dim',
        'n_heads',
        'n_layers',
        'key_dim',
        'feed_forward_dim',
    )
)
dropout_rate = float(params['dropout'])
warmup_steps = int(params['warmup_steps'])
activation = params['activation']
epoch = int(params['epochs'])

# De-duplicate the replies file and keep a random 50-row sample of it.
data_path = '../replies.csv'
replies = (
    pd.read_csv(data_path)
    .drop_duplicates()
    .sample(50)
)

# Build the dataset splits, the text vectorizer and the raw text pairs.
train_ds, val_ds, test_ds, vectorizer, text_pairs = process_data(replies, base_directory, params)

def view_shapes(dataset):
    """Print the shape of the input part of the first element of ``dataset``.

    ``dataset`` must provide ``take(n)`` yielding ``(input, target)`` pairs;
    only the input's ``.shape`` is printed, the target is ignored.

    Note: this definition replaces the ``view_shapes`` imported from
    ``utils.fnProcessing`` earlier in the same cell.
    """
    first_element_only = dataset.take(1)
    for element in first_element_only:
        example_input, _ = element
        print(f"Input shape: {example_input.shape}")
view_shapes(train_ds)

# Count distinct whitespace-separated tokens and track the longest comment /
# response (in tokens) across all text pairs.
comment_tokens = set()
response_comment_tokens = set()
comment_maxlen = response_maxlen = 0
for comment, response in text_pairs:
    comment_tok = comment.split()
    response_tok = response.split()
    if len(comment_tok) > comment_maxlen:
        comment_maxlen = len(comment_tok)
    if len(response_tok) > response_maxlen:
        response_maxlen = len(response_tok)
    comment_tokens |= set(comment_tok)
    response_comment_tokens |= set(response_tok)

print(f"Total Comment tokens: {len(comment_tokens)}")
print(f"Total Response tokens: {len(response_comment_tokens)}")
print(f"Max Comment length: {comment_maxlen}")
print(f"Max Response length: {response_maxlen}")
print(f"{len(text_pairs)} total pairs")

In [None]:
from generative_text.general_gpt_generative.tnn import transformer, CustomSchedule, masked_loss, masked_accuracy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
import os
import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
# Seed TensorFlow and NumPy so that model initialisation and training are repeatable.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Location of the saved model: <model_path>/<model_name>.
model_path = './models/general_gpt_custom/'
model_name = 'best_model_1'
model_full_path = os.path.join(model_path, model_name)

# Check if the model exists and load it; otherwise, create a new one
if os.path.exists(model_full_path):
    # `PositionalEmbedding` is passed in custom_objects below but was never
    # imported, so this branch raised NameError. Importing it inside the branch
    # leaves the "new model" path unaffected.
    # NOTE(review): assumed to live in `tnn` next to the other custom objects -- confirm.
    from generative_text.general_gpt_generative.tnn import PositionalEmbedding

    logging.info(f"Loading model from {model_full_path}")
    transformer_model = tf.keras.models.load_model(model_full_path, custom_objects={
        'masked_loss': masked_loss,
        'masked_accuracy': masked_accuracy,
        'CustomSchedule': CustomSchedule,
        'PositionalEmbedding': PositionalEmbedding
    })
else:
    logging.info("Initializing a new model.")
    # The vocabulary size is taken from the fitted vectorizer rather than from
    # the `vocab_size` config value.
    transformer_model = transformer(
        num_layers=num_layers,
        num_heads=num_heads,
        key_dim=key_dim,
        ff_dim=ff_dim,
        vocab_size=len(vectorizer.get_vocabulary()),
        dropout=dropout_rate
    )
    # Define the learning rate schedule and optimizer
    lr_schedule = CustomSchedule(key_dim, warmup_steps)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=1.0, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    # Compile the model
    transformer_model.compile(
        loss=masked_loss, 
        optimizer=optimizer, 
        metrics=[masked_accuracy]
    )

# Training callbacks: stop after 20 epochs without val_loss improvement (restoring
# the best weights) and checkpoint the full model whenever val_loss improves.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(
        # The file name must be a relative component: with a leading '/',
        # os.path.join drops `model_path` and the checkpoints would be written
        # to the filesystem root instead of the model directory.
        filepath=os.path.join(model_path, 'weights.{epoch:02d}-{val_loss:.2f}.ckpt'),
        # NOTE(review): `save_format` is not a documented ModelCheckpoint
        # argument -- confirm it has an effect on the installed Keras version.
        monitor='val_loss', save_best_only=True, save_weights_only=False, save_format='tf')]

# Fit on the training split with per-epoch validation; keep the returned
# History object for plotting.
history = transformer_model.fit(train_ds, validation_data=val_ds, epochs=epoch, callbacks=callbacks)

# Score the model on the held-out test split, then persist it.
test_loss, test_accuracy = transformer_model.evaluate(test_ds)
transformer_model.save(model_full_path)

In [None]:
# Visualise the training run using the History object returned by
# transformer_model.fit (plot_history is imported from the evaluate module).
plot_history(history)

In [None]:
def decode_sequence(sequence, vectorizer):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        sequence: Iterable of integer token ids.
        vectorizer: Object exposing ``get_vocabulary()``, which returns a list
            in which position ``i`` holds the token for id ``i``.

    Returns:
        str: The tokens joined by single spaces. Ids that are not smaller than
        the vocabulary size are skipped.
    """
    vocabulary = vectorizer.get_vocabulary()
    vocabulary_size = len(vocabulary)
    words = []
    for token_id in sequence:
        if token_id < vocabulary_size:
            words.append(vocabulary[token_id])
    return " ".join(words)

def evaluate_translation(model, dataset, vectorizer, num_samples=5):
    """
    Print decoded model inputs next to the model's decoded predictions.

    For each of the first ``num_samples`` elements taken from ``dataset``, the
    model is run on the element's input. Index 0 of that input and index 0 of
    the prediction (arg-max over the last axis) are decoded with
    ``decode_sequence`` and printed, followed by a separator line.

    Args:
    - model: The trained GPT-style transformer model.
    - dataset: A tf.data.Dataset object yielding (input, target) pairs; the
      target is ignored.
    - vectorizer: The TextVectorization layer whose vocabulary is used to decode ids.
    - num_samples: Number of dataset elements to evaluate.

    Returns:
    - None
    """
    separator = "-" * 50
    for sample_input, _ in dataset.take(num_samples):
        # Run the model first, then decode both sides before anything is printed.
        prediction = model.predict(sample_input)

        source_ids = sample_input[0].numpy()
        input_text_decoded = decode_sequence(source_ids, vectorizer)

        # Most likely token id at each position of the first prediction.
        predicted_ids = np.argmax(prediction[0], axis=-1)
        predicted_text = decode_sequence(predicted_ids, vectorizer)

        print("Input Text: ", input_text_decoded)
        print("Generated Response: ", predicted_text)
        print(separator)

# Example usage: decode and print predictions for the default number of
# elements (num_samples=5) from the test split, using the trained transformer.
evaluate_translation(transformer_model, test_ds, vectorizer)