In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import collections


from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Lambda
from tensorflow.keras.layers import Layer
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding
from tensorflow.keras.regularizers import l2

In [None]:
# Create a MirroredStrategy object; it is later used only via `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()
# Load the Shakespeare lines CSV (the path looks like a Colab Google Drive mount).
data = pd.read_csv("/content/drive/MyDrive/767project/Shakespeare_data.csv")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [None]:
# Keep a random 10% sample of the spoken lines.
dataset = data['PlayerLine']
subset_size = int(len(dataset) * 0.10)
# A seeded generator makes the sample (and therefore the vocabulary and every
# result below) reproducible across kernel restarts.
subset_indices = np.random.default_rng(42).choice(len(dataset), size=subset_size, replace=False)
# `.iloc` selects by position; plain `dataset[subset_indices]` is label-based
# and only works while the Series still carries its default RangeIndex.
subset_dataset = dataset.iloc[subset_indices]
dataset = subset_dataset

In [None]:
# Lower-case every sampled line. This is plain Python string work, so it does
# not need to run inside `strategy.scope()` (the scope only matters when
# Keras/TensorFlow variables are created).
corpus = [line.lower() for line in dataset]
corpus[:10]

['scene i. the english camp at agincourt.',
 'fond done, done fond,',
 'damnation: but this is not so: the king is not',
 'exeunt',
 'hautboys',
 'now let hot aetna cool in sicily,',
 'dumb, yet are they much too light for the bore of',
 'a fever with the absence of her son,',
 'with her her niece, the lady blanch of spain,',
 'and meet me presently at salisbury.']

In [None]:
# Fit a Keras Tokenizer on the corpus and keep its word -> integer id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_to_token = tokenizer.word_index
def key_pair(num):
    """Print the first `num` entries of the notebook-level `word_to_token` mapping.

    Each entry is printed on its own line as `'word': id,`. Nothing is printed
    when `num` is zero or negative.
    """
    for position, (word, token_id) in enumerate(word_to_token.items()):
        if position >= num:
            break
        print(f"'{word}': {token_id},")
# Show the first ten vocabulary entries.
key_pair(10)

'the': 1,
'and': 2,
'i': 3,
'to': 4,
'of': 5,
'a': 6,
'you': 7,
'my': 8,
'in': 9,
'that': 10,


In [None]:
# Expand every line into its growing n-gram prefixes: the first 2 tokens, the
# first 3 tokens, ... up to the whole line. The last token of each prefix is
# later used as the prediction target.
# Tokenising the corpus in one call replaces one tokenizer call per line, and
# no `strategy.scope()` is needed for this pure-Python preprocessing.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):
    for end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:end])


In [None]:
# Inspect the first few n-gram prefixes.
input_sequences[:5]

[[111, 3],
 [111, 3, 1],
 [111, 3, 1, 525],
 [111, 3, 1, 525, 700],
 [111, 3, 1, 525, 700, 46]]

In [None]:
# Remember one un-padded example so it can be compared with its padded form later.
before = input_sequences[1]
# Length of the longest n-gram prefix; every sequence is padded to this length.
max_seq_len = max(map(len, input_sequences))
print(max_seq_len)

68


In [None]:
# Vocabulary size. Tokenizer ids start at 1, so +1 leaves room for id 0,
# which pad_sequences uses as the padding value.
total_words = len(word_to_token)+1
print(total_words)

9030


In [None]:
# Padding: left-pad ('pre') every prefix with zeros to max_seq_len so that the
# target token always sits in the last column.
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_len, padding = 'pre'))
# The same example captured in `before`, now in padded form.
after = input_sequences[1]

In [None]:
# Compare the example sequence before and after padding.
print(f'Before: {before}')
print(f'After: {after}')

Before: [111, 3, 1]
After: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0 111   3   1]


In [None]:
# All columns but the last are the model input; the last column is the next-word target.
features = input_sequences[:, :-1]
labels = input_sequences[:, -1]
# One-hot encode the targets over the whole vocabulary for CategoricalCrossentropy.
labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
def transformer_model_modified(total_words, max_seq_len):
    """Build a one-block Transformer encoder for next-word prediction.

    The network embeds the token ids, applies one multi-head self-attention
    sub-layer and one feed-forward sub-layer (each followed by a residual add
    and layer normalisation), then classifies the representation of the last
    time step over the vocabulary. No attention mask is passed and no
    positional encoding is added.

    Args:
        total_words: vocabulary size (embedding rows and softmax width).
        max_seq_len: padded sequence length including the target token; the
            model input therefore has `max_seq_len - 1` positions.

    Returns:
        An uncompiled `tf.keras.Model`.
    """
    embed_dim, head_count, hidden_units = 64, 4, 128

    # Token ids -> trainable embeddings
    token_ids = Input(shape=(max_seq_len - 1,))
    embedded = Embedding(total_words, embed_dim)(token_ids)

    # Self-attention sub-layer with residual connection and normalisation
    self_attention = tf.keras.layers.MultiHeadAttention(num_heads=head_count, key_dim=embed_dim)
    attended = self_attention(embedded, embedded)
    attended = tf.keras.layers.Add()([attended, embedded])
    attended = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attended)

    # Position-wise feed-forward sub-layer with residual connection and normalisation
    hidden = Dense(hidden_units, activation='relu')(attended)
    projected = Dense(embed_dim)(hidden)
    projected = tf.keras.layers.Add()([projected, attended])
    encoded = tf.keras.layers.LayerNormalization(epsilon=1e-6)(projected)

    # Classify from the final time step only
    last_step = Lambda(lambda x: x[:, -1, :])(encoded)
    next_word_probs = Dense(total_words, activation='softmax')(last_step)

    return Model(inputs=token_ids, outputs=next_word_probs)



In [None]:
# Instantiate the baseline model (trainable embeddings learned from scratch).
transformer_model = transformer_model_modified(total_words, max_seq_len)

In [None]:
# Compile with Adam (default settings) and categorical cross-entropy, which
# matches the one-hot encoded labels; track accuracy.
transformer_model.compile(
    optimizer=Adam(),
    loss=CategoricalCrossentropy(),
    metrics=['accuracy']
)


# Print the layer-by-layer summary.
transformer_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 47)]                 0         []                            
                                                                                                  
 embedding_5 (Embedding)     (None, 47, 64)               577472    ['input_6[0][0]']             
                                                                                                  
 multi_head_attention_5 (Mu  (None, 47, 64)               66368     ['embedding_5[0][0]',         
 ltiHeadAttention)                                                   'embedding_5[0][0]']         
                                                                                                  
 add_10 (Add)                (None, 47, 64)               0         ['multi_head_attention_5

In [None]:
# Train the baseline on the full feature/label arrays (no validation split is used here).
EPOCHS = 50
BATCH_SIZE = 8
history = transformer_model.fit(features, labels, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
def test_generator(model, string, num):
    if len(string) == 0:
        print("Error: No word found")
        return

    for _ in range(num):
        token_list = tokenizer.texts_to_sequences([string])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding="pre")
        # Predict the next word base on model
        probabilities = model.predict(token_list)[0]
        # Use the highest probability words
        predicted = np.argmax(probabilities)
        if predicted != 0:
            generated_word = tokenizer.index_word[predicted]
            string += " " + generated_word

    print(string)

In [None]:
# Generate 10 words from the baseline model.
test_generator(transformer_model, "long live the king", 10)

long live the king enlarge happy habiliment fee lint boast nay 'this perverseness tread


In [None]:
# Generate 5 words from the baseline model (the seed ends with a space).
test_generator(transformer_model, "Wherefore art thou ", 5)

Wherefore art thou  deform'd seed meet grave mistaken


In [None]:
# Generate 10 words from a single-word seed.
test_generator(transformer_model, "thee", 10)

thee shrinking maps dined beseech false wild erring will't virtues came


In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import gensim.downloader as api

# Load the pretrained 300-dimensional FastText vectors through gensim's downloader.
# NOTE(review): the generic name `model` is rebound by later cells in this notebook.
model = api.load('fasttext-wiki-news-subwords-300')


# Sanity check: nearest neighbours of a common word.
print(model.most_similar('apple'))


[('apples', 0.8046640753746033), ('pear', 0.6897592544555664), ('peach', 0.6626990437507629), ('fruit', 0.6596963405609131), ('apple-', 0.6546189785003662), ('appley', 0.6466962099075317), ('pippin', 0.6454442143440247), ('pome', 0.6110042333602905), ('apple-tree', 0.6037725210189819), ('berry', 0.602673351764679)]


In [None]:
# Row i holds the FastText vector of the word with tokenizer id i; words the
# pretrained model does not contain (and padding row 0) stay all-zero.
embedding_matrix = np.zeros((total_words, 300))
for word, index in tokenizer.word_index.items():
    if word not in model:
        continue
    embedding_vector = model[word]
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [None]:


def transformer_model_modified_FastText(total_words, max_seq_len, pretrained_embeddings=None):
    """Build the one-block Transformer with frozen pretrained (FastText) embeddings.

    Args:
        total_words: vocabulary size (embedding rows and softmax width).
        max_seq_len: padded sequence length including the target token; the
            model input has `max_seq_len - 1` positions.
        pretrained_embeddings: optional `(total_words, dim)` array of word
            vectors. When omitted, the notebook-level `embedding_matrix` is
            used, as before. Passing it explicitly is safer because that
            global is rebound later in the notebook.

    Returns:
        An uncompiled `tf.keras.Model`.

    Raises:
        ValueError: if the embedding matrix is not 2-D or its row count does
            not equal `total_words`.
    """
    if pretrained_embeddings is None:
        pretrained_embeddings = embedding_matrix
    pretrained_embeddings = np.asarray(pretrained_embeddings)
    if pretrained_embeddings.ndim != 2 or pretrained_embeddings.shape[0] != total_words:
        raise ValueError(
            f"Expected an embedding matrix of shape ({total_words}, dim), "
            f"got {pretrained_embeddings.shape}"
        )

    # The model width follows the pretrained vectors instead of a hard-coded 300
    d_model = pretrained_embeddings.shape[1]
    num_heads = 4
    ff_dim = 128

    inputs = Input(shape=(max_seq_len-1,))

    # Frozen pretrained word embeddings, loaded through a Constant initializer
    embedding_layer = Embedding(
        total_words,
        d_model,
        embeddings_initializer=tf.keras.initializers.Constant(pretrained_embeddings),
        trainable=False,
    )(inputs)
    # Multi-head self-attention
    transformer_block = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
    attn_output = transformer_block(embedding_layer, embedding_layer)
    # Residual connection
    attn_output = tf.keras.layers.Add()([attn_output, embedding_layer])
    # Normalization
    attn_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_output)

    # Feed-forward network
    ffn_output = tf.keras.layers.Dense(ff_dim, activation='relu')(attn_output)
    ffn_output = tf.keras.layers.Dense(d_model)(ffn_output)

    ffn_output = tf.keras.layers.Add()([ffn_output, attn_output])
    # Normalization
    seq_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(ffn_output)

    # Output layer: classify from the last time step
    final_output = Lambda(lambda x: x[:, -1, :])(seq_output)
    outputs = Dense(total_words, activation='softmax')(final_output)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [None]:

# Split the data: 80% train, then the remaining 20% halved into validation and test.
# NOTE(review): the rows are n-gram prefixes, so prefixes of the same line can
# land in different splits.
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Model
# NOTE(review): this rebinds the builder function's name to the model it
# returns, so the cell cannot be re-run without re-running the `def` cell first.
transformer_model_modified_FastText = transformer_model_modified_FastText(total_words, max_seq_len)

transformer_model_modified_FastText.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

transformer_model_modified_FastText.summary()



Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 67)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 67, 300)              2709000   ['input_3[0][0]']             
                                                                                                  
 multi_head_attention_2 (Mu  (None, 67, 300)              1443900   ['embedding_2[0][0]',         
 ltiHeadAttention)                                                   'embedding_2[0][0]']         
                                                                                                  
 add_4 (Add)                 (None, 67, 300)              0         ['multi_head_attention_2

In [None]:
# FastText model training, with per-epoch validation on the held-out validation split
EPOCHS = 50
BATCH_SIZE = 8
history = transformer_model_modified_FastText.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)


# Final evaluation on the test split
test_loss, test_accuracy = transformer_model_modified_FastText.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 9.193650245666504
Test Accuracy: 0.04968944191932678


In [None]:
# Generate 15 words from the FastText-embedding model.
test_generator(transformer_model_modified_FastText, "thee", 15)

thee i'll i am a weary but two watched night and did perceive not sworn two


In [None]:
# Generate 10 words from the FastText-embedding model.
test_generator(transformer_model_modified_FastText, "Wherefore art thou ", 10)

Wherefore art thou  not thy low thy low thy low thy foe thou


In [None]:
# Load the pretrained 300-dimensional Google News Word2Vec vectors through gensim's downloader.
word2vec_model = api.load('word2vec-google-news-300')



In [None]:

# Rebuild `embedding_matrix` from Word2Vec: row i holds the vector of the word
# with tokenizer id i; out-of-vocabulary words (and padding row 0) stay all-zero.
embedding_matrix = np.zeros((total_words, 300))

for word, i in tokenizer.word_index.items():
    if word not in word2vec_model:
        continue
    embedding_vector = word2vec_model[word]
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:


def transformer_model_modified_Word2Vec(total_words, max_seq_len, pretrained_embeddings=None):
    """Build the one-block Transformer with frozen pretrained (Word2Vec) embeddings.

    Args:
        total_words: vocabulary size (embedding rows and softmax width).
        max_seq_len: padded sequence length including the target token; the
            model input has `max_seq_len - 1` positions.
        pretrained_embeddings: optional `(total_words, dim)` array of word
            vectors. When omitted, the notebook-level `embedding_matrix` is
            used, as before. Passing it explicitly avoids silently picking up
            whichever matrix that global happened to be bound to last.

    Returns:
        An uncompiled `tf.keras.Model`.

    Raises:
        ValueError: if the embedding matrix is not 2-D or its row count does
            not equal `total_words`.
    """
    if pretrained_embeddings is None:
        pretrained_embeddings = embedding_matrix
    pretrained_embeddings = np.asarray(pretrained_embeddings)
    if pretrained_embeddings.ndim != 2 or pretrained_embeddings.shape[0] != total_words:
        raise ValueError(
            f"Expected an embedding matrix of shape ({total_words}, dim), "
            f"got {pretrained_embeddings.shape}"
        )

    # The model width follows the pretrained vectors instead of a hard-coded 300
    d_model = pretrained_embeddings.shape[1]
    num_heads = 4
    ff_dim = 128

    inputs = Input(shape=(max_seq_len-1,))

    # Frozen pretrained word embeddings, loaded through a Constant initializer
    embedding_layer = Embedding(
        total_words,
        d_model,
        embeddings_initializer=tf.keras.initializers.Constant(pretrained_embeddings),
        trainable=False,
    )(inputs)
    # Multi-head self-attention
    transformer_block = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
    attn_output = transformer_block(embedding_layer, embedding_layer)
    # Residual connection
    attn_output = tf.keras.layers.Add()([attn_output, embedding_layer])
    # Normalization
    attn_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_output)

    # Feed-forward network
    ffn_output = tf.keras.layers.Dense(ff_dim, activation='relu')(attn_output)
    ffn_output = tf.keras.layers.Dense(d_model)(ffn_output)

    ffn_output = tf.keras.layers.Add()([ffn_output, attn_output])
    # Normalization
    seq_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(ffn_output)

    # Output layer: classify from the last time step
    final_output = Lambda(lambda x: x[:, -1, :])(seq_output)
    outputs = Dense(total_words, activation='softmax')(final_output)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [None]:
# Word2Vec Model
# NOTE(review): this rebinds the builder function's name to the model it
# returns, so the cell cannot be re-run without re-running the `def` cell first.
transformer_model_modified_Word2Vec = transformer_model_modified_Word2Vec(total_words, max_seq_len)

transformer_model_modified_Word2Vec.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

transformer_model_modified_Word2Vec.summary()


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 67)]                 0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, 67, 300)              2709000   ['input_5[0][0]']             
                                                                                                  
 multi_head_attention_4 (Mu  (None, 67, 300)              1443900   ['embedding_4[0][0]',         
 ltiHeadAttention)                                                   'embedding_4[0][0]']         
                                                                                                  
 add_8 (Add)                 (None, 67, 300)              0         ['multi_head_attention_4

In [None]:
# Word2Vec model training on the same train/validation split as the FastText model
EPOCHS = 50
BATCH_SIZE = 8
history = transformer_model_modified_Word2Vec.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)






Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Generate 10 words from the Word2Vec-embedding model.
test_generator(transformer_model_modified_Word2Vec, "Wherefore art thou ", 10)

Wherefore art thou  come come come come come come come come come come


In [None]:
# Generate 15 words from the Word2Vec-embedding model.
test_generator(transformer_model_modified_Word2Vec, "thee", 15)

thee well served in the watch glory glory glory glory glory glory glory glory glory glory


# Based on the results and performance above, the original transformer (with trainable embeddings) performs best, so the rest of the notebook fine-tunes that model.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import collections


from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Lambda
from tensorflow.keras.layers import Layer
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, Layer, Dropout, LayerNormalization, Add
from tensorflow.keras.layers import MultiHeadAttention

In [None]:
# Initialize the strategy for distributed training
strategy = tf.distribute.MirroredStrategy()

# Load the data and keep a reproducible random 50% sample of the lines
data = pd.read_csv("/content/drive/MyDrive/767project/Shakespeare_data.csv")
dataset = data['PlayerLine']
subset_size = int(len(dataset) * 0.5)
# A seeded generator makes the sample, and therefore the vocabulary size, repeatable
subset_indices = np.random.default_rng(42).choice(len(dataset), size=subset_size, replace=False)
subset_dataset = dataset.iloc[subset_indices]
dataset = subset_dataset

# Clean text and tokenize
tokenizer = Tokenizer()
corpus = []
original_lines = []
lengths = []

# Plain-Python preprocessing: `strategy.scope()` is not needed here because no
# TensorFlow variables are created.
for line in dataset:
    # Keep only letters and whitespace
    clean_line = re.sub(r'[^a-zA-Z\s]', '', line.lower())
    word_count = len(clean_line.split())
    # Keep only lines with at least four words
    if word_count > 3:
        corpus.append(clean_line)
        original_lines.append(line)
        lengths.append(word_count)

if not lengths:
    # min()/max()/percentile below would fail with a far less helpful error
    raise ValueError("No line with more than 3 words survived cleaning; nothing to train on.")

print("min:", min(lengths))
print("max:", max(lengths))
print("mean:", np.mean(lengths))
print("median:", np.median(lengths))
print("90%<=:", np.percentile(lengths, 90))

tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
total_words = len(tokenizer.word_index) + 1  # +1 for padding token

# Define new_max_length: 90% of the kept lines are at most this many words long
new_max_length = int(np.percentile(lengths, 90))

# Pair sequences with their original lengths and lines
sequence_data = list(zip(sequences, original_lines, lengths))
sequence_data.sort(key=lambda x: x[2])  # Sort by length

# Define buckets (boundaries on the number of words per line)
buckets = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
bucket_data = collections.defaultdict(list)

# Assign sequences to buckets based on length
for seq, original, length in sequence_data:
    bucket_idx = np.digitize([length], buckets)[0]
    if bucket_idx >= len(buckets):
        bucket_idx = len(buckets) - 1  # Place in the last bucket if it exceeds the max length
    bucket_data[buckets[bucket_idx]].append(seq)

def dynamic_bucketed_data_generator(bucket_data, batch_size, total_words, report_padding_ratio=False):
    """Yield `(features, labels)` batches forever, one length bucket at a time.

    Each sequence is split into its context (all tokens but the last) and its
    target (the last token) BEFORE padding, so the label is always a real word.
    Taking the last column of an already post-padded batch would instead give
    the padding id 0 for every sequence shorter than the longest one in the
    batch. Contexts are left-padded ('pre') so that the final time step, which
    the models in this notebook read via `x[:, -1, :]`, is always the last real
    context token.

    Args:
        bucket_data: mapping of bucket boundary -> list of token-id sequences.
            Each list is shuffled in place at the start of every pass.
        batch_size: maximum number of sequences per batch.
        total_words: vocabulary size, used as the one-hot width of the labels.
        report_padding_ratio: when True, print the share of padding in every
            feature batch.

    Yields:
        features: int array of shape `(batch, longest_sequence_in_batch - 1)`.
        labels: one-hot array of shape `(batch, total_words)`.

    Raises:
        ValueError: on the first `next()` if no sequence has at least two
            tokens (otherwise the endless loop would never yield).
    """
    if not any(len(seq) >= 2 for seqs in bucket_data.values() for seq in seqs):
        raise ValueError("bucket_data contains no sequence with at least 2 tokens")

    while True:
        for bucket_length, sequences in bucket_data.items():
            np.random.shuffle(sequences)
            for start in range(0, len(sequences), batch_size):
                # A sequence needs at least one context token plus one target token
                batch = [seq for seq in sequences[start:start + batch_size] if len(seq) >= 2]
                if not batch:
                    continue

                contexts = [seq[:-1] for seq in batch]
                targets = [seq[-1] for seq in batch]

                # Pad only up to the longest context in this batch
                context_len = max(len(context) for context in contexts)
                features = pad_sequences(contexts, maxlen=context_len, padding='pre', truncating='pre')

                if report_padding_ratio:
                    # Id 0 is the padding token
                    padding_ratio = np.sum(features == 0) / features.size
                    print(f"bucket {bucket_length}: batch shape {features.shape}, padding ratio {padding_ratio:.2%}")

                labels = to_categorical(targets, num_classes=total_words)

                yield features, labels

def _group_by_bucket(seqs, boundaries):
    """Group sequences into a dict keyed by the bucket boundary their length falls under."""
    grouped = collections.defaultdict(list)
    last_idx = len(boundaries) - 1
    for seq in seqs:
        # Lengths beyond the final boundary are clamped into the last bucket
        bucket_idx = min(int(np.digitize([len(seq)], boundaries)[0]), last_idx)
        grouped[boundaries[bucket_idx]].append(seq)
    return grouped

# Prepare data for training, validation, and testing (60% / 20% / 20%)
all_data = [seq for seqs in bucket_data.values() for seq in seqs]
train_data, temp_data = train_test_split(all_data, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Assign the sequences of every split back to their length buckets
train_bucket_data = _group_by_bucket(train_data, buckets)
val_bucket_data = _group_by_bucket(val_data, buckets)
test_bucket_data = _group_by_bucket(test_data, buckets)

# Create generators using the new bucketed data structures
train_generator = dynamic_bucketed_data_generator(train_bucket_data, batch_size=64, total_words=total_words, report_padding_ratio=True)
val_generator = dynamic_bucketed_data_generator(val_bucket_data, batch_size=64, total_words=total_words)
# NOTE(review): this rebinds `test_generator`, which earlier in the notebook is
# the text-generation helper function; that function is unavailable afterwards.
test_generator = dynamic_bucketed_data_generator(test_bucket_data, batch_size=64, total_words=total_words)

# Print total words
print(f"Total words: {total_words}")




min: 4
max: 72
mean: 7.853289734443123
median: 8.0
90%<=: 10.0
Total words: 19744


In [None]:
# Test the generator for padding ratio print: pull five batches as a smoke test.
# NOTE(review): this rebinds the notebook-level `features` / `labels` arrays
# created in the first half of the notebook.
for _ in range(5):
    features, labels = next(train_generator)

In [None]:
#KAN simple version
def create_padding_mask(seq):
    """Build an attention mask that hides padding positions.

    Keras `MultiHeadAttention` interprets `attention_mask` as 1 = may attend,
    0 = masked, so real tokens (non-zero ids) must map to 1 and padding (id 0)
    to 0. Marking the padding positions with 1 would make the layer attend
    only to padding.

    Args:
        seq: integer token ids of shape `(batch, seq_len)`, with 0 as padding.

    Returns:
        float32 tensor of shape `(batch, 1, 1, seq_len)` that broadcasts over
        heads and query positions.
    """
    mask = tf.cast(tf.math.not_equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def swish(x):
    """Swish activation: the input scaled by its own sigmoid."""
    gate = tf.sigmoid(x)
    return x * gate

def transformer_model_with_kan(total_words, max_seq_len):
    """Build a one-block Transformer whose feed-forward part is a `KANLayer`.

    Token ids are embedded and passed through masked multi-head self-attention
    (mask from `create_padding_mask`), then through a `KANLayer` followed by an
    L2-regularised Dense projection back to the embedding width. Both
    sub-layers use a residual add, layer normalisation and dropout. The last
    time step is classified over the vocabulary.

    Args:
        total_words: vocabulary size (embedding rows and softmax width).
        max_seq_len: number of input positions.

    Returns:
        An uncompiled Keras `Model`.
    """
    embed_dim, head_count, kan_units = 64, 4, 128

    token_ids = Input(shape=(max_seq_len,), name="inputs")
    embedded = Embedding(total_words, embed_dim)(token_ids)
    pad_mask = create_padding_mask(token_ids)

    # Self-attention sub-layer
    self_attention = MultiHeadAttention(num_heads=head_count, key_dim=embed_dim)
    attended = self_attention(embedded, embedded, attention_mask=pad_mask)
    attended = Add()([attended, embedded])
    attended = LayerNormalization(epsilon=1e-6)(attended)
    attended = Dropout(0.1)(attended)

    # KAN sub-layer, projected back to the embedding width
    kan_features = KANLayer(embed_dim, kan_units)(attended)
    kan_features = Dense(embed_dim, kernel_regularizer=l2(0.01))(kan_features)
    kan_features = Add()([kan_features, attended])
    encoded = LayerNormalization(epsilon=1e-6)(kan_features)
    encoded = Dropout(0.1)(encoded)

    # Classify from the final time step only
    last_step = Lambda(lambda x: x[:, -1, :])(encoded)
    next_word_probs = Dense(total_words, activation='softmax')(last_step)

    return Model(inputs=token_ids, outputs=next_word_probs)

class SimplePolynomialLayer(Layer):
    """Learned cubic polynomial mixing of the feature axis.

    For an input of shape `(batch, seq_len, input_dim)` the layer builds the
    powers x, x^2 and x^3 of every feature and combines them with a trainable
    coefficient tensor of shape `(input_dim, output_dim, 3)`, producing
    `(batch, seq_len, output_dim)`.

    Args:
        input_dim: size of the last axis of the input.
        output_dim: size of the last axis of the output.
        **kwargs: forwarded to `Layer` (e.g. `name`).
    """

    def __init__(self, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.coefficients = self.add_weight(
            name="coefficients",
            shape=(input_dim, output_dim, 3),
            initializer='random_normal',
            trainable=True,
        )

    def call(self, inputs):
        """Apply the polynomial expansion and contract it with the coefficients."""
        x = tf.expand_dims(inputs, -1)  # Shape: (batch_size, sequence_length, input_dim, 1)
        poly_terms = tf.concat([x, tf.pow(x, 2), tf.pow(x, 3)], axis=-1)  # Shape: (batch_size, sequence_length, input_dim, 3)

        # Sum over the input features (i) and the polynomial degree (j)
        return tf.einsum('bsij,ioj->bso', poly_terms, self.coefficients)  # Shape: (batch_size, sequence_length, output_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({"input_dim": self.input_dim, "output_dim": self.output_dim})
        return config

class KANLayer(Layer):
    """Thin wrapper that delegates to a single `SimplePolynomialLayer`."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.poly_layer = SimplePolynomialLayer(input_dim, output_dim)

    def call(self, inputs):
        """Return the polynomial layer's output for `inputs`."""
        transformed = self.poly_layer(inputs)
        return transformed

In [None]:

class SimplePolynomialLayer(Layer):
    """Cubic polynomial feature mixing with trainable coefficients.

    Maps `(batch, seq_len, input_dim)` to `(batch, seq_len, output_dim)` by
    weighting x, x^2 and x^3 of every input feature.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        coefficient_shape = (input_dim, output_dim, 3)
        self.coefficients = self.add_weight(
            shape=coefficient_shape,
            initializer='random_normal',
            trainable=True
        )

    def call(self, inputs):
        """Expand the inputs into their first three powers and contract with the coefficients."""
        first = tf.expand_dims(inputs, -1)
        second = tf.pow(first, 2)
        third = tf.pow(first, 3)
        powers = tf.concat([first, second, third], axis=-1)
        # Sum over input features (i) and polynomial degree (j)
        return tf.einsum('bsij,ioj->bso', powers, self.coefficients)

class EnhancedKANLayer(Layer):
    """Polynomial block with ReLU, dropout, projection, residual add and layer norm.

    The Dense projection maps back to `input_dim` so the result can be added
    to the block's own input before normalisation.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.1):
        super().__init__()
        self.poly_layer = SimplePolynomialLayer(input_dim, output_dim)
        self.dropout = Dropout(dropout_rate)
        self.norm = LayerNormalization(epsilon=1e-6)
        self.dense = Dense(input_dim)  # project back to the input width

    def call(self, inputs):
        """Run polynomial -> ReLU -> dropout -> Dense, then normalise the residual sum."""
        hidden = tf.nn.relu(self.poly_layer(inputs))
        hidden = self.dropout(hidden)
        projected = self.dense(hidden)
        return self.norm(projected + inputs)

def create_padding_mask(seq):
    """Build an attention mask that hides padding positions.

    Keras `MultiHeadAttention` interprets `attention_mask` as 1 = may attend,
    0 = masked, so real tokens (non-zero ids) must map to 1 and padding (id 0)
    to 0. Marking the padding positions with 1 would make the layer attend
    only to padding.

    Args:
        seq: integer token ids of shape `(batch, seq_len)`, with 0 as padding.

    Returns:
        float32 tensor of shape `(batch, 1, 1, seq_len)`.
    """
    return tf.cast(tf.math.not_equal(seq, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

def transformer_model_with_enhanced_kan(total_words, max_seq_len):
    """Build a one-block Transformer whose feed-forward part is an `EnhancedKANLayer`.

    The `EnhancedKANLayer` already applies its own residual add and layer
    normalisation; this builder then adds the attention output a second time
    and normalises again before classifying the last time step.

    Args:
        total_words: vocabulary size (embedding rows and softmax width).
        max_seq_len: number of input positions.

    Returns:
        An uncompiled Keras `Model`.
    """
    embed_dim, head_count = 64, 4

    token_ids = Input(shape=(max_seq_len,), name="inputs")
    embedded = Embedding(total_words, embed_dim)(token_ids)
    pad_mask = create_padding_mask(token_ids)

    # Masked self-attention with residual connection and normalisation
    self_attention = MultiHeadAttention(num_heads=head_count, key_dim=embed_dim)
    attended = self_attention(embedded, embedded, attention_mask=pad_mask)
    attended = Add()([attended, embedded])
    attended = LayerNormalization(epsilon=1e-6)(attended)

    # KAN block followed by a second residual connection and normalisation
    kan_features = EnhancedKANLayer(embed_dim, embed_dim)(attended)
    kan_features = Add()([kan_features, attended])
    encoded = LayerNormalization(epsilon=1e-6)(kan_features)

    # Classify from the final time step only
    last_step = Lambda(lambda x: x[:, -1, :])(encoded)
    next_word_probs = Dense(total_words, activation='softmax')(last_step)

    return Model(inputs=token_ids, outputs=next_word_probs)

In [None]:
# Original transformer
def create_padding_mask(seq):
    """Build an attention mask that hides padding positions.

    Keras `MultiHeadAttention` interprets `attention_mask` as 1 = may attend,
    0 = masked, so real tokens (non-zero ids) must map to 1 and padding (id 0)
    to 0. Marking the padding positions with 1 would make the layer attend
    only to padding.

    Args:
        seq: integer token ids of shape `(batch, seq_len)`, with 0 as padding.

    Returns:
        float32 tensor of shape `(batch, 1, 1, seq_len)` that broadcasts over
        heads and query positions.
    """
    mask = tf.cast(tf.math.not_equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def swish(x):
    """Swish activation: the input scaled by its own sigmoid."""
    gate = tf.sigmoid(x)
    return x * gate

def transformer_model_orginal_final(total_words, max_seq_len):
    """Build the tuned one-block Transformer (padding mask, dropout, swish FFN).

    Token ids are embedded and passed through masked multi-head self-attention
    (mask from `create_padding_mask`) and a swish feed-forward network. Each
    sub-layer has a residual connection and layer normalisation, with dropout
    after the attention normalisation and on the feed-forward output. The last
    time step is classified over the vocabulary.

    Args:
        total_words: vocabulary size (embedding rows and softmax width).
        max_seq_len: number of input positions.

    Returns:
        An uncompiled Keras `Model`.
    """
    embed_dim, head_count, hidden_units = 64, 4, 128

    # Token ids -> trainable embeddings, plus the padding mask
    token_ids = Input(shape=(max_seq_len,), name="inputs")
    embedded = Embedding(total_words, embed_dim)(token_ids)
    pad_mask = create_padding_mask(token_ids)

    # Masked self-attention sub-layer
    self_attention = MultiHeadAttention(num_heads=head_count, key_dim=embed_dim)
    attended = self_attention(embedded, embedded, attention_mask=pad_mask)
    attended = Add()([attended, embedded])
    attended = LayerNormalization(epsilon=1e-6)(attended)
    attended = Dropout(0.1)(attended)

    # Feed-forward sub-layer with swish activation
    hidden = Dense(hidden_units, activation=swish)(attended)
    projected = Dense(embed_dim)(hidden)
    projected = Dropout(0.1)(projected)
    projected = Add()([projected, attended])
    encoded = LayerNormalization(epsilon=1e-6)(projected)

    # Classify from the final time step only
    last_step = Lambda(lambda x: x[:, -1, :])(encoded)
    next_word_probs = Dense(total_words, activation='softmax')(last_step)

    return Model(inputs=token_ids, outputs=next_word_probs)


In [None]:
# KAN Transformer
initial_learning_rate = 0.0001
num_epochs = 50
batch_size = 64

# Cut the learning rate by 10x when validation loss stalls for 5 epochs
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    verbose=1,
    min_lr=0.00001
)

class PrintLR(tf.keras.callbacks.Callback):
    """Print the optimizer's current learning rate at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        print("\nEpoch", epoch+1, "current learning rate:", tf.keras.backend.get_value(self.model.optimizer.learning_rate))

optimizer = tf.keras.optimizers.Adam(learning_rate=initial_learning_rate)

# Build the model exactly once. Rebinding `transformer_model_with_kan` to the
# returned model and then calling that model with keyword arguments is what
# raised "The first argument to `Layer.call` must always be passed".
model = transformer_model_with_kan(total_words=total_words, max_seq_len=new_max_length-1)

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

# Print the model summary
model.summary()

# Train the model. `lr_scheduler` is included so this run uses the same
# callbacks as the baseline transformer run and the two stay comparable.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_data) // batch_size,
    epochs=num_epochs,
    validation_data=val_generator,
    validation_steps=len(val_data) // batch_size,
    callbacks=[lr_scheduler, PrintLR()]
)

ValueError: The first argument to `Layer.call` must always be passed.

In [None]:
# Cut the learning rate by 10x when validation loss has not improved for 5 epochs,
# never going below 1e-5.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    verbose=1,
    min_lr=0.00001
)
# Callback that prints the optimizer's current learning rate after every epoch.
class PrintLR(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print("\nEpoch", epoch+1, "current learning rate:", tf.keras.backend.get_value(self.model.optimizer.lr))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
# NOTE(review): this rebinds the builder function's name to the model it
# returns, so the cell cannot be re-run without re-running the `def` cell first.
# The model takes new_max_length-1 input positions.
transformer_model_orginal_final = transformer_model_orginal_final(total_words=total_words, max_seq_len=new_max_length-1)

transformer_model_orginal_final.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

transformer_model_orginal_final.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 inputs (InputLayer)         [(None, 9)]                  0         []                            
                                                                                                  
 tf.math.equal_3 (TFOpLambd  (None, 9)                    0         ['inputs[0][0]']              
 a)                                                                                               
                                                                                                  
 tf.cast_3 (TFOpLambda)      (None, 9)                    0         ['tf.math.equal_3[0][0]']     
                                                                                                  
 embedding_3 (Embedding)     (None, 9, 64)                1263296   ['inputs[0][0]']        

In [None]:
# Train the baseline transformer.
# Floor division: only whole batches are counted for each epoch / validation pass.
baseline_steps_per_epoch = len(train_data) // batch_size
baseline_validation_steps = len(val_data) // batch_size

# Learning-rate scheduling plus per-epoch learning-rate printout.
baseline_callbacks = [lr_scheduler, PrintLR()]

history = transformer_model_orginal_final.fit(
    train_generator,
    epochs=num_epochs,
    steps_per_epoch=baseline_steps_per_epoch,
    validation_data=val_generator,
    validation_steps=baseline_validation_steps,
    callbacks=baseline_callbacks,
)

Epoch 1/50
Epoch 1 current learning rate: 1e-04
Epoch 2/50
Epoch 2 current learning rate: 1e-04
Epoch 3/50
Epoch 3 current learning rate: 1e-04
Epoch 4/50
Epoch 4 current learning rate: 1e-04
Epoch 5/50
Epoch 5 current learning rate: 1e-04
Epoch 6/50
Epoch 6 current learning rate: 1e-04
Epoch 7/50
Epoch 7 current learning rate: 1e-04
Epoch 8/50
Epoch 8 current learning rate: 1e-04
Epoch 9/50
Epoch 9 current learning rate: 1e-04
Epoch 10/50
Epoch 10 current learning rate: 1e-04
Epoch 11/50
Epoch 11 current learning rate: 1e-04
Epoch 12/50
Epoch 12: ReduceLROnPlateau reducing learning rate to 1e-05.

Epoch 12 current learning rate: 1e-05
Epoch 13/50
Epoch 13 current learning rate: 1e-05
Epoch 14/50
Epoch 14 current learning rate: 1e-05
Epoch 15/50
Epoch 15 current learning rate: 1e-05
Epoch 16/50
Epoch 16 current learning rate: 1e-05
Epoch 17/50
Epoch 17 current learning rate: 1e-05
Epoch 18/50
Epoch 18 current learning rate: 1e-05
Epoch 19/50
Epoch 19 current learning rate: 1e-05
Epoch 

In [None]:

# Destination on the mounted Drive; the `.h5` suffix makes Keras write an HDF5 file.
model_save_path = '/content/drive/My Drive/My Models/transformer_model_modified_final.h5'

# Persist the trained model to that path.
transformer_model_modified_final.save(filepath=model_save_path)


  saving_api.save_model(


In [None]:
import os

# Confirm the save worked by listing what is in the models folder.
saved_models_dir = '/content/drive/My Drive/My Models'
print(os.listdir(saved_models_dir))


['transformer_model_modified_final.h5']


In [None]:
import random

def test_generator(model, seed_text, num_words, temperature=1.0):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=new_max_length-1, padding='post')

        predicted_probs = model.predict(token_list, verbose=0)[0]  # Get the first sequence
        # Apply temperature scaling
        predictions = np.log(predicted_probs + 1e-10) / temperature  # Adding a small constant to avoid log(0)
        exp_predictions = np.exp(predictions)
        predicted_probs = exp_predictions / np.sum(exp_predictions)

        # Ensure <pad> token (usually index 0) is never chosen
        predicted_probs[0] = 0
        predicted_probs = predicted_probs / np.sum(predicted_probs)  # Re-normalize probabilities

        # Select a word based on the probability distribution
        try:
            predicted_index = np.random.choice(range(len(predicted_probs)), p=predicted_probs)
        except ValueError:
            print("Error: Probability distribution does not sum to 1.")
            continue

        predicted_word = tokenizer.index_word.get(predicted_index, '')

        if predicted_word == '':
            print("No valid prediction; check the model's output.")
            break

        seed_text += " " + predicted_word
        print("Generated so far:", seed_text)  # Print the generated text so far

    return seed_text

In [None]:
# Sample 20 words after a classic opening.
test_generator(model=transformer_model_modified_final, seed_text="Wherefore art thou ", num_words=20)

Generated so far: Wherefore art thou  fallst
Generated so far: Wherefore art thou  fallst commonwealth
Generated so far: Wherefore art thou  fallst commonwealth unspeakable
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out my
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out my fond
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out my fond of
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out my fond of fold
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out my fond of fold night
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulness out my fond of fold night another
Generated so far: Wherefore art thou  fallst commonwealth unspeakable foulnes

'Wherefore art thou  fallst commonwealth unspeakable foulness out my fond of fold night another hit of another flesh as i am i am'

In [None]:
# Sample 8 words after a three-word seed.
test_generator(model=transformer_model_modified_final, seed_text="extremities he endured", num_words=8)

Generated so far: extremities he endured kates
Generated so far: extremities he endured kates traditional
Generated so far: extremities he endured kates traditional nedars
Generated so far: extremities he endured kates traditional nedars felt
Generated so far: extremities he endured kates traditional nedars felt there
Generated so far: extremities he endured kates traditional nedars felt there can
Generated so far: extremities he endured kates traditional nedars felt there can never
Generated so far: extremities he endured kates traditional nedars felt there can never foul


'extremities he endured kates traditional nedars felt there can never foul'

In [None]:
# Sample 6 words after a single-word seed.
test_generator(model=transformer_model_modified_final, seed_text="thee", num_words=6)

Generated so far: thee prefixd
Generated so far: thee prefixd allottery
Generated so far: thee prefixd allottery fingering
Generated so far: thee prefixd allottery fingering newlighted
Generated so far: thee prefixd allottery fingering newlighted induce
Generated so far: thee prefixd allottery fingering newlighted induce bridegroom


'thee prefixd allottery fingering newlighted induce bridegroom'