# Implementation

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# List the GPUs TensorFlow can see. This only reports devices; it does not
# select a backend. An empty list means no GPU is visible to TensorFlow.
print(tf.config.list_physical_devices('GPU'))

# Tokenize the texts
# NOTE(review): `texts` is not defined in this cell — it must come from an
# earlier cell (presumably a list of raw strings; confirm).
vocab_size = 10000  # Size of the vocabulary
max_length = 400  # Length of input sequences

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Pad short sequences with trailing zeros up to max_length.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Define model parameters
embedding_dim = 300  # Dimension of the embedding vectors
lstm_units = 128  # Number of LSTM units
num_classes = 2  # Number of output classes (for binary classification)

# Define the input layer: one row of max_length integer token ids per sample
input_text = Input(shape=(max_length,), dtype='int32', name='text_input')

# Embedding layer
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input shape, and the argument is deprecated in Keras 3.
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)

# Two stacked LSTM layers; the first returns the full sequence so the second
# LSTM receives one vector per time step.
x = LSTM(lstm_units, return_sequences=True)(embedding)
x = Dropout(0.5)(x)
x = LSTM(lstm_units)(x)
x = Dropout(0.5)(x)

# Output layer: one softmax probability per class
output = Dense(num_classes, activation='softmax')(x)

# Define the model
model = Model(inputs=input_text, outputs=output)

# Compile the model
# sparse_categorical_crossentropy expects integer class ids as labels.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

In [None]:
# Fit the baseline LSTM classifier, holding out the last 20% of the samples
# for validation.
# NOTE(review): `labels` is not defined in the visible cells — confirm it is
# created earlier in the notebook.
model.fit(
    x=padded_sequences,
    y=labels,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

# Embeddings

## Word2Vec

In [None]:
import re
import numpy as np
import spacy
from gensim.models import KeyedVectors
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model

# Load the spaCy pipeline once at module level; preprocess_text() uses the
# resulting global `nlp` to tokenize every document.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

def preprocess_text(text):
    """Tokenize ``text`` and return its alphabetic, non-stop-word tokens.

    Every character that is not an ASCII letter or whitespace is stripped
    first, then the remainder is tokenized with the module-level spaCy
    pipeline ``nlp``.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings in document order, with their original
        casing. Tokens whose lowercase form is in spaCy's English stop-word
        set, and tokens that are not purely alphabetic, are left out.
    """
    # Remove non-alphabetic characters
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    parsed = nlp(letters_only)
    english_stop_words = spacy.lang.en.stop_words.STOP_WORDS

    kept_tokens = []
    for token in parsed:
        if token.text.lower() in english_stop_words:
            continue
        if not token.is_alpha:
            continue
        kept_tokens.append(token.text)
    return kept_tokens

def get_word2vec_embeddings(text, model, vector_size=300):
    """Convert ``text`` into a matrix with one Word2Vec vector per token.

    Args:
        text: Raw input string; it is tokenized with ``preprocess_text``.
        model: Either a gensim ``KeyedVectors`` instance (what
            ``KeyedVectors.load_word2vec_format`` returns) or a full
            ``Word2Vec`` model that exposes its vectors as ``model.wv``.
        vector_size: Dimensionality of the vectors; must match the vectors
            stored in ``model``.

    Returns:
        A ``numpy`` array of shape ``(num_tokens, vector_size)``. Rows for
        tokens missing from the vocabulary stay all-zero. Text with no
        remaining tokens gives an array of shape ``(0, vector_size)``.
    """
    tokens = preprocess_text(text)
    # KeyedVectors has no `.wv` attribute in gensim 4, so fall back to the
    # object itself; a full Word2Vec model is still unwrapped via `.wv`.
    vectors = getattr(model, 'wv', model)
    # Start from zeros so out-of-vocabulary tokens need no explicit handling.
    embeddings = np.zeros((len(tokens), vector_size))
    for i, token in enumerate(tokens):
        if token in vectors:
            embeddings[i] = vectors[token]
    return embeddings

# Load pre-trained Word2Vec embeddings.
# The vectors get their own name so the Keras `model` built in the first
# section is not overwritten; re-running that section's training cell would
# otherwise call `.fit` on a KeyedVectors object.
word2vec_path = 'path/to/word2vec.bin'  # Replace with your Word2Vec file path
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Dimension of the Word2Vec embeddings, read from the loaded vectors so it
# always matches the file instead of relying on a hard-coded 300.
embedding_dim = word2vec_model.vector_size

# Convert texts to embeddings: one (num_tokens, embedding_dim) array per text
word2vec_embeddings = [
    get_word2vec_embeddings(text, word2vec_model, vector_size=embedding_dim)
    for text in texts
]
# Zero-pad every document at the end, up to the length of the longest one.
padded_embeddings = pad_sequences(word2vec_embeddings, padding='post', dtype='float32')

# Determine the max length of the sequences
max_length = padded_embeddings.shape[1]

# Define model parameters
lstm_units = 128  # Number of LSTM units
num_classes = 2  # Number of output classes (for binary classification)

# Define the input layer: the network consumes precomputed vectors directly,
# so there is no Embedding layer in this variant.
input_text = Input(shape=(max_length, embedding_dim), dtype='float32', name='text_input')

# Two stacked LSTM layers; the first returns the full sequence so the second
# LSTM receives one vector per time step.
x = LSTM(lstm_units, return_sequences=True)(input_text)
x = Dropout(0.5)(x)
x = LSTM(lstm_units)(x)
x = Dropout(0.5)(x)

# Output layer
output = Dense(num_classes, activation='softmax')(x)

# Define the model
model_word2vec = Model(inputs=input_text, outputs=output)

# Compile the model
model_word2vec.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model_word2vec.summary()

# Train the model
# NOTE(review): `labels` is not defined in the visible cells — confirm it is
# created earlier in the notebook.
model_word2vec.fit(padded_embeddings, labels, epochs=10, batch_size=32, validation_split=0.2)


## BERT

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel

def generate_bert_embeddings(text, tokenizer, model):
    """Return the last hidden state ``model`` produces for ``text``.

    Args:
        text: The input passed to ``tokenizer``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="tf", truncation=True, padding=True)``.
        model: Callable invoked on the tokenizer's output; its result must
            expose a ``last_hidden_state`` attribute.

    Returns:
        ``model(...).last_hidden_state`` — presumably a tensor of shape
        (batch, tokens, hidden_size) for a BERT model; confirm against the
        model actually passed in.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="tf")
    return model(encoded).last_hidden_state

# Load the pretrained BERT tokenizer and encoder.
# NOTE(review): this rebinds the globals `tokenizer` and `model` that earlier
# sections also assign; re-running earlier cells afterwards will see these.
model_name = "GerMedBERT/medbert-512"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

# Encode each text on its own; squeeze(0) drops the leading batch axis of size
# one so every entry is a 2-D array for that text.
bert_embeddings = [generate_bert_embeddings(text, tokenizer, model).numpy().squeeze(0) for text in texts]
# Zero-pad every document at the end, up to the length of the longest one.
padded_embeddings = pad_sequences(bert_embeddings, padding='post', dtype='float32')


# Define model parameters
lstm_units = 128  # Number of LSTM units
num_classes = 2  # Number of output classes (for binary classification)
embedding_dim = padded_embeddings.shape[-1]

# Define the input layers
input_text = Input(shape=(padded_embeddings.shape[1], embedding_dim), dtype='float32', name='text_input')

# Two LSTM layers
# The first LSTM must consume the symbolic `input_text` tensor, not the Python
# list `bert_embeddings`; otherwise the output is not connected to the Input
# handed to Model(...) below and the model cannot be built.
x = LSTM(lstm_units, return_sequences=True)(input_text)
x = Dropout(0.5)(x)
x = LSTM(lstm_units)(x)
x = Dropout(0.5)(x)

# Output layer
output = Dense(num_classes, activation='softmax')(x)

# Define the model
model_bert = Model(inputs=input_text, outputs=output)

# Compile the model
model_bert.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model_bert.summary()

In [None]:
# Train the LSTM classifier on the padded BERT embeddings, holding out the
# last 20% of the samples for validation.
model_bert.fit(
    x=padded_embeddings,
    y=labels,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

## Batch normalization

### On input