In [13]:
import ast
import multiprocessing
import os

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel


In [14]:
# Colab-only step: mount Google Drive at /content/drive so the data and output
# folders configured below are reachable through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder locations on the mounted Drive, read as globals by main():
#   input_folder  - source CSVs handed to process_csv_files
#   output_folder - where the per-token embedding CSVs are written; main() later
#                   lists every *.csv in it as training input
input_folder = "/content/drive/MyDrive/Data folder"
output_folder = "/content/drive/MyDrive/Output folder"

In [33]:
class BioBERTEmbeddingProcessor:
    """
    Produce per-token embeddings with a BERT-style Hugging Face model and
    convert folders of labelled-sentence CSV files into embedding CSV files.

    Input CSVs must have the columns 'Sentence' and 'Label'. Output CSVs have
    one row per token with the columns 'Word', 'Embedding', 'Label' and
    'SentenceId' (the 0-based row position of the source sentence in its
    input file, so sentences can be reassembled by a consumer).
    """

    # Tokens inserted by the tokenizer that carry no word content.
    _SPECIAL_TOKENS = frozenset(["[CLS]", "[SEP]", "[PAD]"])

    # Column order of the generated CSV; the original three columns come first
    # so positional consumers keep working.
    _OUTPUT_COLUMNS = ["Word", "Embedding", "Label", "SentenceId"]

    def __init__(self, model_name="dmis-lab/biobert-base-cased-v1.1"):
        """
        Initialize the tokenizer and model used for embedding generation

        Args:
            model_name (str): Hugging Face model identifier; it is used for
                both the tokenizer and the model.
        """
        # Device configuration
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load tokenizer and model from the requested checkpoint (the argument
        # used to be ignored in favour of a hard-coded name).
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

        # Label encoder to manage categorical labels
        self.label_encoder = LabelEncoder()

    def generate_word_embeddings(self, sentence):
        """
        Generate token-level embeddings for a given sentence

        Args:
            sentence (str): Input sentence

        Returns:
            list: One dict per non-special token, in order, with the keys
                'word' (the token string returned by the tokenizer) and
                'embedding' (the token's last-hidden-state vector as a list
                of floats). Input longer than 512 tokens is truncated.
        """
        # Tokenize the sentence
        tokens = self.tokenizer(sentence, return_tensors="pt",
                                padding=True,
                                truncation=True,
                                max_length=512).to(self.device)

        # Generate embeddings without tracking gradients (inference only)
        with torch.no_grad():
            outputs = self.model(**tokens)

        # Drop the batch dimension and move to CPU once instead of per token
        embeddings = outputs.last_hidden_state.squeeze(0).cpu()
        token_ids = tokens['input_ids'].squeeze(0)
        words = self.tokenizer.convert_ids_to_tokens(token_ids)

        # Filter out special tokens
        return [
            {"word": word, "embedding": embeddings[i].tolist()}
            for i, word in enumerate(words)
            if word not in self._SPECIAL_TOKENS
        ]

    def process_csv_files(self, input_folder, output_folder, num_workers=None):
        """
        Generate embedding CSVs for every '.csv' file in a folder

        Each input file is written to output_folder under the same file name.
        Files are processed in a worker pool only when that can work: with a
        CUDA model, a single worker, or a single file they are processed
        sequentially in this process.

        Args:
            input_folder (str): Folder containing input CSV files
            output_folder (str): Folder to save processed embeddings
            num_workers (int, optional): Number of parallel workers; defaults
                to cpu_count() - 1 (at least 1) and is capped at the number
                of files.
        """
        # Ensure output folder exists
        os.makedirs(output_folder, exist_ok=True)

        # Get list of CSV files (sorted so runs are reproducible)
        csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
        if not csv_files:
            print(f"No CSV files found in {input_folder}")
            return

        if num_workers is None:
            num_workers = max(1, multiprocessing.cpu_count() - 1)
        num_workers = max(1, min(num_workers, len(csv_files)))

        # Prepare arguments for each file
        args = [(os.path.join(input_folder, file),
                 os.path.join(output_folder, file),
                 self) for file in csv_files]

        # A CUDA context cannot be used from forked worker processes, and a
        # pool of one only adds pickling overhead, so run in-process then.
        if num_workers == 1 or self.device.type == "cuda":
            for job in args:
                self._process_single_file(*job)
            return

        # NOTE: every task pickles this processor (model included) to the worker.
        with multiprocessing.Pool(num_workers) as pool:
            pool.starmap(self._process_single_file, args)

    def _process_single_file(self, input_path, output_path, processor):
        """
        Process a single CSV file

        Errors are reported with a printed message rather than raised, so one
        bad file does not stop a batch. Rows whose 'Sentence' is missing are
        skipped.

        Args:
            input_path (str): Path to input CSV
            output_path (str): Path to output CSV
            processor (BioBERTEmbeddingProcessor): Processor instance
        """
        try:
            # Read input CSV
            df = pd.read_csv(input_path)

            # Prepare output data
            all_embeddings = []

            # Process each row; sentence_id is the row position in this file
            for sentence_id, (_, row) in enumerate(df.iterrows()):
                sentence = row['Sentence']
                label = row['Label']

                # An empty cell is read as NaN, which the tokenizer cannot handle
                if pd.isna(sentence):
                    continue

                # Generate word embeddings
                word_embeddings = processor.generate_word_embeddings(str(sentence))

                # Collect embeddings with labels and the owning sentence
                for item in word_embeddings:
                    all_embeddings.append({
                        "Word": item['word'],
                        "Embedding": item['embedding'],
                        "Label": label,
                        "SentenceId": sentence_id
                    })

            # Explicit columns keep the header even when no rows were produced,
            # so the output can always be read back.
            output_df = pd.DataFrame(all_embeddings, columns=self._OUTPUT_COLUMNS)
            output_df.to_csv(output_path, index=False)
            print(f"Processed {input_path} -> {output_path}")

        except Exception as e:
            print(f"Error processing {input_path}: {e}")


In [34]:
class BiLSTMClassifier(nn.Module):
    """
    Sequence classifier: a single-layer bidirectional LSTM followed by
    dropout and a linear layer producing one logit per class.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        """
        Bidirectional LSTM Classifier

        Args:
            input_dim (int): Embedding dimension
            hidden_dim (int): LSTM hidden layer dimension
            output_dim (int): Number of output classes
            dropout (float): Dropout rate
        """
        super().__init__()

        # Bidirectional LSTM; inputs are (batch, seq_len, input_dim)
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True
        )

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

        # Fully connected output layer (forward + backward states concatenated)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """
        Forward pass through the network

        Args:
            x (torch.Tensor): Input embeddings of shape
                (batch, seq_len, input_dim)

        Returns:
            torch.Tensor: Output logits of shape (batch, output_dim)
        """
        # h_n holds the final hidden state of each direction:
        # h_n[-2] is the forward pass after the last timestep,
        # h_n[-1] is the backward pass after it has reached the first timestep.
        _, (h_n, _) = self.lstm(x)

        # Indexing the per-timestep output at position -1 would give the
        # backward direction after a single step only; the final states of
        # both directions each summarize the whole sequence.
        sequence_repr = torch.cat((h_n[-2], h_n[-1]), dim=1)

        # Apply dropout
        sequence_repr = self.dropout(sequence_repr)

        # Generate output
        return self.fc(sequence_repr)


In [46]:
class BioBERTBiLSTMClassificationPipeline:
    """
    End-to-end pipeline: load per-token embedding CSVs, train a BiLSTM
    classifier on them, and predict labels for raw sentences.

    The best model weights and the fitted label encoder are written to
    MODEL_PATH and LABEL_ENCODER_PATH in the current working directory.
    """

    MODEL_PATH = 'best_bilstm_model.pth'
    LABEL_ENCODER_PATH = 'label_encoder.pkl'

    def __init__(self, embedding_dim=768, hidden_dim=128):
        """
        End-to-end BioBERT BiLSTM Classification Pipeline

        Args:
            embedding_dim (int): Dimension of BioBERT embeddings
            hidden_dim (int): LSTM hidden layer dimension
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embedding_processor = BioBERTEmbeddingProcessor()
        self.model = None
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # Sequence length used for padding during training; set by train_model
        # and reused by predict_sentences so both see the same input shape.
        self.max_len = None

    def _to_padded_tensor(self, sequences, max_len):
        """
        Stack variable-length sequences into one zero-padded float tensor.

        Args:
            sequences (list): List of sequences, each a list of embedding vectors
            max_len (int): Target length; longer sequences are truncated

        Returns:
            torch.Tensor: Shape (len(sequences), max_len, embedding_dim)
        """
        batch = torch.zeros((len(sequences), max_len, self.embedding_dim), dtype=torch.float32)
        for i, seq in enumerate(sequences):
            seq = seq[:max_len]
            if seq:
                batch[i, :len(seq)] = torch.tensor(seq, dtype=torch.float32)
        return batch

    def prepare_data(self, csv_files):
        """
        Prepare data for training

        Rows of each file are grouped into sequences by the 'SentenceId'
        column when the file has one, so a sequence is the ordered tokens of
        one sentence. Files without that column are grouped by 'Word', as
        before. The first 'Label' of each group is used as its label.

        Args:
            csv_files (list): List of CSV file paths with the columns
                'Word', 'Embedding' and 'Label' (optionally 'SentenceId')

        Returns:
            tuple: (list of sequences of embedding vectors, encoded labels)
        """
        all_embeddings = []
        all_labels = []

        for filename in csv_files:
            df = pd.read_csv(filename)

            group_key = 'SentenceId' if 'SentenceId' in df.columns else 'Word'

            for _, group in df.groupby(group_key):
                # Embeddings are stored as the text of a Python list; parse them
                # as literals only, so file contents are never executed as code.
                sentence_embeddings = [ast.literal_eval(cell) for cell in group['Embedding']]
                label = group['Label'].iloc[0]  # assumed identical within a group

                all_embeddings.append(sentence_embeddings)
                all_labels.append(label)

        # Encode labels
        all_labels = self.embedding_processor.label_encoder.fit_transform(all_labels)

        return all_embeddings, all_labels

    def train_model(self, csv_files, test_size=0.2, batch_size=32, epochs=10):
        """
        Train BiLSTM model

        After every epoch the model is evaluated on the held-out split; when
        the accuracy improves, the weights and the label encoder are saved.

        Args:
            csv_files (list): List of training CSV files
            test_size (float): Proportion of test data
            batch_size (int): Training batch size
            epochs (int): Number of training epochs

        Returns:
            float: Best validation accuracy

        Raises:
            ValueError: If the CSV files contain no sequences.
        """
        # Prepare data
        embeddings, labels = self.prepare_data(csv_files)
        if len(embeddings) == 0:
            raise ValueError("No training sequences found in the given CSV files")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            embeddings, labels,
            test_size=test_size,
            random_state=42
        )

        # Pad sequences to uniform length and remember it for prediction
        max_len = max(len(seq) for seq in embeddings)
        self.max_len = max_len
        X_train = self._to_padded_tensor(X_train, max_len)
        X_test = self._to_padded_tensor(X_test, max_len)
        y_train = torch.tensor(y_train, dtype=torch.long)
        y_test = torch.tensor(y_test, dtype=torch.long)

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
        test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # Initialize model with one output per encoded class
        self.model = BiLSTMClassifier(
            input_dim=self.embedding_dim,
            hidden_dim=self.hidden_dim,
            output_dim=len(self.embedding_processor.label_encoder.classes_)
        ).to(self.device)

        # Loss and optimizer; learning rate is divided by 10 every 5 epochs
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters())
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

        # Training loop
        best_accuracy = 0
        for epoch in range(epochs):
            self.model.train()
            train_loss = 0

            for batch_x, batch_y in train_loader:
                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(batch_x)
                loss = criterion(outputs, batch_y)

                # Backward pass
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # Validation
            self.model.eval()
            correct = 0
            total = 0

            with torch.no_grad():
                for batch_x, batch_y in test_loader:
                    batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
                    predicted = self.model(batch_x).argmax(dim=1)

                    total += batch_y.size(0)
                    correct += (predicted == batch_y).sum().item()

            # Update learning rate
            scheduler.step()

            # Track best model
            accuracy = correct / total if total else 0.0
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                torch.save(self.model.state_dict(), self.MODEL_PATH)
                joblib.dump(self.embedding_processor.label_encoder, self.LABEL_ENCODER_PATH)

            print(f"Epoch {epoch+1}/{epochs}: Loss = {train_loss/len(train_loader):.4f}, Accuracy = {accuracy:.4f}")

        return best_accuracy

    def predict_sentences(self, sentences, max_len=None):
        """
        Predict labels for multiple sentences

        If no model is held in memory, the saved weights and label encoder
        are loaded from MODEL_PATH and LABEL_ENCODER_PATH first.

        Args:
            sentences (list): List of sentences to predict
            max_len (int, optional): Length every sentence is truncated/padded
                to. Defaults to the length used by train_model in this
                session; if there is none, each sentence keeps its own length.

        Returns:
            list: Predicted labels
        """
        # Ensure model is loaded
        if self.model is None:
            # The encoder must be loaded first: the number of classes sizes the
            # output layer. joblib.load unpickles, so only load trusted files.
            self.embedding_processor.label_encoder = joblib.load(self.LABEL_ENCODER_PATH)

            self.model = BiLSTMClassifier(
                input_dim=self.embedding_dim,
                hidden_dim=self.hidden_dim,
                output_dim=len(self.embedding_processor.label_encoder.classes_)
            ).to(self.device)
            # map_location lets weights saved on a GPU load on a CPU-only machine
            self.model.load_state_dict(torch.load(self.MODEL_PATH, map_location=self.device))

        self.model.eval()
        predictions = []

        target_len = max_len if max_len is not None else getattr(self, 'max_len', None)

        for sentence in sentences:
            # Generate word embeddings
            word_embeddings = self.embedding_processor.generate_word_embeddings(sentence)
            sentence_embeddings = [item['embedding'] for item in word_embeddings]

            # At least one timestep is required, even for an empty sentence
            length = target_len if target_len is not None else len(sentence_embeddings)
            length = max(1, length)

            # Convert to a (1, length, embedding_dim) tensor
            sentence_tensor = self._to_padded_tensor([sentence_embeddings], length).to(self.device)

            # Predict
            with torch.no_grad():
                output = self.model(sentence_tensor)
                predicted_label_idx = output.argmax(dim=1).item()
                predicted_label = self.embedding_processor.label_encoder.inverse_transform([predicted_label_idx])[0]
                predictions.append(predicted_label)

        return predictions

In [48]:
# Example Usage
def main():
    """
    Demo run: embed the input CSVs, train the classifier on the generated
    embedding files, then print predictions for a few sample sentences.

    Reads the module-level `input_folder` and `output_folder` paths.
    """
    clf_pipeline = BioBERTBiLSTMClassificationPipeline()

    # Step 1: turn every input CSV into a per-token embedding CSV.
    clf_pipeline.embedding_processor.process_csv_files(
        input_folder=input_folder,
        output_folder=output_folder,
    )

    # Step 2: train on every CSV now present in the output folder.
    embedding_csvs = []
    for entry in os.listdir(output_folder):
        if entry.endswith('.csv'):
            embedding_csvs.append(os.path.join(output_folder, entry))
    clf_pipeline.train_model(embedding_csvs, epochs=10)

    # Step 3: classify a handful of sample sentences.
    sample_sentences = [
        "A gland is an organ that makes one or more substances, such as hormones, digestive juices, sweat or tears.",
        "Endocrine glands release hormones directly into the bloodstream.",
        "endocrine system is an elaborate network of glands and hormones..",
        "A thyroidectomy is the surgical removal of your entire thyroid gland.",
    ]
    predicted_labels = clf_pipeline.predict_sentences(sample_sentences)

    # Step 4: report each sentence next to its predicted label.
    for text, label in zip(sample_sentences, predicted_labels):
        print(f"Sentence: {text}")
        print(f"Predicted Label: {label}\n")

# Run the demo when this cell/script is executed directly, but not when the
# code is imported as a module.
if __name__ == "__main__":
    main()

Processed /content/drive/MyDrive/Data folder/blood_pressure_control.csv -> /content/drive/MyDrive/Output folder/blood_pressure_control.csv
Processed /content/drive/MyDrive/Data folder/heart_cycle.csv -> /content/drive/MyDrive/Output folder/heart_cycle.csv
Processed /content/drive/MyDrive/Data folder/circulatory_flow.csv -> /content/drive/MyDrive/Output folder/circulatory_flow.csv
Processed /content/drive/MyDrive/Data folder/dehydration_response.csv -> /content/drive/MyDrive/Output folder/dehydration_response.csv
Processed /content/drive/MyDrive/Data folder/exercise_response.csv -> /content/drive/MyDrive/Output folder/exercise_response.csv
Processed /content/drive/MyDrive/Data folder/fluid_balance.csv -> /content/drive/MyDrive/Output folder/fluid_balance.csv
Processed /content/drive/MyDrive/Data folder/kidney_function.csv -> /content/drive/MyDrive/Output folder/kidney_function.csv
Processed /content/drive/MyDrive/Data folder/lung_perfusion_balance.csv -> /content/drive/MyDrive/Output fo