<a href="https://colab.research.google.com/github/isuru0x01/Notebooks/blob/main/Modular_Design_ML_Framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install tensorflow



In [48]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np

In [3]:
class TokenizerModule:
    """Thin wrapper around a pretrained Hugging Face BERT tokenizer."""

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer identified by ``model_name``.

        Args:
            model_name: Name or path handed to ``BertTokenizer.from_pretrained``.
        """
        pretrained = BertTokenizer.from_pretrained(model_name)
        self.tokenizer = pretrained

    def tokenize(self, texts):
        """Run the wrapped tokenizer on ``texts``.

        Args:
            texts: Input forwarded unchanged to the tokenizer.

        Returns:
            The tokenizer's result, requested as TensorFlow tensors with
            padding and truncation switched on.
        """
        encode_options = {
            'return_tensors': 'tf',
            'padding': True,
            'truncation': True,
        }
        return self.tokenizer(texts, **encode_options)

In [26]:
class BertEmbeddingModule:
    """Embedding stage backed by a pretrained TensorFlow BERT model."""

    def __init__(self, model_name='bert-base-uncased'):
        """Load the BERT weights identified by ``model_name``.

        Args:
            model_name: Name or path handed to ``TFBertModel.from_pretrained``.
        """
        pretrained = TFBertModel.from_pretrained(model_name)
        self.model = pretrained

    def get_embeddings(self, inputs):
        """Return the model's ``last_hidden_state`` for ``inputs``.

        Args:
            inputs: Tokenizer output, passed to the model as a single
                positional argument.
        """
        return self.model(inputs).last_hidden_state

In [6]:
class SequenceModelingModule:
    """Bidirectional-LSTM binary classifier over sequences of embedding vectors.

    The network is: variable-length input of ``embedding_dim``-wide vectors ->
    bidirectional LSTM -> ReLU dense layer -> single sigmoid unit. It is
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    NOTE: the input layer carries no mask, so padded positions in a batch are
    fed through the LSTM like any other timestep.
    """

    def __init__(self, embedding_dim=768, lstm_units=128, dense_units=128):
        """Build and compile the classifier.

        Args:
            embedding_dim: Width of each input embedding vector. The default
                (768) matches the previously hard-coded value; pass a
                different width when the embedding model's hidden size differs.
            lstm_units: Units per direction in the bidirectional LSTM.
            dense_units: Units in the hidden dense layer.
        """
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.dense_units = dense_units
        self.model = self.build_model()

    def build_model(self):
        """Create and compile the Keras model from the configured sizes.

        Returns:
            The compiled ``tf.keras`` model.
        """
        # ``None`` on the time axis lets batches differ in sequence length.
        input_layer = tf.keras.layers.Input(shape=(None, self.embedding_dim))
        lstm_layer = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(self.lstm_units)
        )(input_layer)
        dense_layer = tf.keras.layers.Dense(self.dense_units, activation='relu')(lstm_layer)
        output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(dense_layer)
        model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def train(self, X, y, epochs=3):
        """Fit the classifier on ``X`` / ``y``.

        Args:
            X: Training inputs passed to ``model.fit``.
            y: Training targets passed to ``model.fit``.
            epochs: Number of passes over the data.

        Returns:
            Whatever ``model.fit`` returns (the Keras training history), so
            callers can inspect loss and metrics. Earlier versions returned
            ``None``.
        """
        return self.model.fit(X, y, epochs=epochs)

    def predict(self, X):
        """Return ``model.predict(X)`` for the given inputs."""
        return self.model.predict(X)


In [31]:
class NLPFramework:
    """Pipeline chaining tokenization, embedding and sequence classification.

    The embedding stage can be replaced after construction through
    ``set_embedding_module``; the tokenizer and the classifier stay fixed.
    """

    def __init__(self, tokenizer_model='bert-base-uncased', embedding_model='bert-base-uncased'):
        """Create the three pipeline stages.

        Args:
            tokenizer_model: Model name handed to ``TokenizerModule``.
            embedding_model: Model name handed to ``BertEmbeddingModule``,
                which is the default embedding stage.
        """
        self.tokenizer = TokenizerModule(tokenizer_model)
        self.embedder = BertEmbeddingModule(embedding_model)
        self.sequence_model = SequenceModelingModule()

    def set_embedding_module(self, embedding_module):
        """Replace the embedding stage with ``embedding_module``.

        The replacement is used through its ``get_embeddings(inputs)`` method.
        """
        self.embedder = embedding_module

    def process_texts(self, texts):
        """Tokenize ``texts`` and return the embedder's output for them."""
        return self.embedder.get_embeddings(self.tokenizer.tokenize(texts))

    def train(self, texts, labels, epochs=3):
        """Embed ``texts`` and fit the sequence model against ``labels``.

        Args:
            texts: Raw input texts.
            labels: Targets; converted to a NumPy array before training.
            epochs: Number of training epochs.
        """
        features = self.process_texts(texts)
        targets = np.array(labels)
        self.sequence_model.train(features, targets, epochs)

    def predict(self, texts):
        """Embed ``texts`` and return the sequence model's predictions."""
        return self.sequence_model.predict(self.process_texts(texts))


In [42]:
# Toy corpus: three sentences paired with made-up binary labels.
texts = [
    "Hello, how are you?",
    "I am fine, thank you.",
    "What is your name?",
]
labels = [0, 1, 0]

# Build the pipeline with its defaults and fit the classifier for 5 epochs.
framework = NLPFramework()
framework.train(texts, labels, epochs=5)

# Score three unseen sentences and display the raw model outputs.
new_texts = [
    "What time is it?",
    "I need some help!",
    "Everything, Good, thank you!.",
]
predictions = framework.predict(new_texts)
print(predictions)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




[[0.04522345]
 [0.16463251]
 [0.6946345 ]]


In [54]:
class DistilBertEmbeddingModule:
    """Embedding stage backed by a pretrained TensorFlow DistilBERT model."""

    def __init__(self, model_name='distilbert-base-uncased'):
        """Load the DistilBERT weights identified by ``model_name``.

        Args:
            model_name: Name or path handed to
                ``TFDistilBertModel.from_pretrained``.
        """
        self.model = TFDistilBertModel.from_pretrained(model_name)

    def get_embeddings(self, inputs):
        """Return the model's ``last_hidden_state`` for ``inputs``.

        Args:
            inputs: Mapping of tokenizer outputs. Any ``'token_type_ids'``
                entry is left out of the model call. The mapping itself is
                not modified, so the caller can reuse it afterwards.

        Returns:
            The ``last_hidden_state`` attribute of the model output.
        """
        # Filter into a new dict instead of popping, so the caller's
        # tokenizer output keeps its 'token_type_ids' entry.
        model_inputs = {
            key: value
            for key, value in inputs.items()
            if key != 'token_type_ids'
        }
        outputs = self.model(**model_inputs)
        return outputs.last_hidden_state

In [56]:
# Same dummy sentences and labels as the earlier BERT run
texts = ["Hello, how are you?", "I am fine, thank you.", "What is your name?"]
labels = [0, 1, 0]  # Dummy labels for binary classification

# Swap the framework's embedding module for DistilBERT
distilbert_embedder = DistilBertEmbeddingModule(model_name='distilbert-base-uncased')
framework.set_embedding_module(distilbert_embedder)


# Train and test with DistilBERT embeddings
# NOTE(review): `framework` and `new_texts` come from the earlier cell, and
# set_embedding_module only replaces the embedder, so the sequence model that
# was already fitted once keeps training here. Confirm that is intended.
framework.train(texts, labels, 5)
predictions = framework.predict(new_texts)
print(predictions)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[[6.2223140e-04]
 [1.1963818e-01]
 [7.5153041e-01]]
