<a href="https://colab.research.google.com/github/hrishabh1919/vani-Setu-AI/blob/main/vaniAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from datasets import load_dataset

In [None]:
class MultilingualNLPModel:
    """Hold a dataset, a tokenizer and a sequence-classification model.

    Attributes set by ``__init__``:
        dataset: Result of ``load_dataset(dataset_name)``.
        tokenizer: Result of ``AutoTokenizer.from_pretrained(model_name)``.
        model: Result of
            ``AutoModelForSequenceClassification.from_pretrained(...)``,
            moved to ``device``.
        device: ``torch.device('cuda')`` when CUDA is available, else CPU.
    """

    def __init__(self, model_name='xlm-roberta-base', num_labels=5,
                 dataset_name="ai4bharat/INCLUDE"):
        """Load the dataset, tokenizer and model, then move the model to a device.

        Args:
            model_name: Checkpoint name passed to both ``from_pretrained``
                calls.
            num_labels: Size of the classification head requested from
                ``AutoModelForSequenceClassification``.
            dataset_name: Dataset identifier passed to ``load_dataset``.
                Defaults to the dataset that used to be hard-coded here, so
                existing callers are unaffected.

        Raises:
            Exception: Any error raised while loading is printed and then
                re-raised unchanged.
        """
        print("Initializing model...")
        try:
            # Load dataset
            self.dataset = load_dataset(dataset_name)

            # Initialize tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Initialize model
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=num_labels,
            )

            # Prefer the GPU when one is visible to torch, otherwise use CPU.
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            print(f"Using device: {self.device}")
            self.model.to(self.device)

        except Exception as e:
            print(f"Error during initialization: {e}")
            raise

In [None]:

def preprocess_data(self, text_column='text', label_column='label', max_length=512):
    """Tokenize ``self.dataset`` and switch it to torch-formatted columns.

    Args:
        self: Object exposing ``tokenizer`` (callable) and ``dataset``
            (supports ``map`` and, on the mapped result, ``set_format``).
        text_column: Name of the column whose values are tokenized.
        label_column: Name of the label column kept in the torch format.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        The object returned by ``self.dataset.map(...)`` after
        ``set_format('torch', ...)`` has been applied to it, restricted to
        ``input_ids``, ``attention_mask`` and ``label_column``.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    print("Preprocessing data...")
    try:
        def tokenize_function(examples):
            # Pad to a fixed length rather than `padding=True`: with a
            # batched `map`, "longest" is computed per map batch, so rows
            # from different map batches would end up with different lengths
            # and could not be stacked into one tensor by a DataLoader.
            # The trade-off is more padding (and memory) for short texts.
            return self.tokenizer(
                examples[text_column],
                padding='max_length',
                truncation=True,
                max_length=max_length,
            )

        tokenized_datasets = self.dataset.map(
            tokenize_function,
            batched=True,
        )

        # Only these columns are returned (as torch tensors) when indexing.
        tokenized_datasets.set_format(
            'torch',
            columns=['input_ids', 'attention_mask', label_column],
        )

        return tokenized_datasets

    except Exception as e:
        print(f"Error in preprocessing: {e}")
        raise

In [None]:
def create_dataloaders(self, batch_size=16):
    """Build the training and validation ``DataLoader`` objects.

    Calls ``self.preprocess_data()`` and wraps its ``'train'`` and
    ``'validation'`` entries in loaders; only the training loader shuffles.

    Args:
        self: Object exposing ``preprocess_data()``.
        batch_size: Batch size used for both loaders.

    Returns:
        A ``(train_loader, val_loader)`` tuple.

    Raises:
        Exception: Any error (for example a missing split) is printed and
            then re-raised unchanged.
    """
    print("Creating dataloaders...")
    try:
        splits = self.preprocess_data()
        training_split, validation_split = splits['train'], splits['validation']

        shuffled_train_loader = DataLoader(
            training_split, batch_size=batch_size, shuffle=True
        )
        ordered_val_loader = DataLoader(validation_split, batch_size=batch_size)
        return shuffled_train_loader, ordered_val_loader

    except Exception as error:
        print(f"Error creating dataloaders: {error}")
        raise

In [None]:
def train(self, epochs=3, learning_rate=2e-5):
    """Fine-tune ``self.model`` and print one loss line per epoch.

    Loaders come from ``self.create_dataloaders()``. Each epoch runs an
    optimisation pass over the training loader followed by a gradient-free
    pass over the validation loader, then prints both mean losses.

    Args:
        self: Object exposing ``model``, ``device`` and
            ``create_dataloaders()``.
        epochs: Number of epochs to run.
        learning_rate: Learning rate passed to ``torch.optim.AdamW``.

    Returns:
        None.
    """
    train_loader, val_loader = self.create_dataloaders()
    optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)

    def batch_loss(batch):
        """Move one batch to the device and return the model's loss tensor."""
        return self.model(
            batch['input_ids'].to(self.device),
            attention_mask=batch['attention_mask'].to(self.device),
            labels=batch['label'].to(self.device),
        ).loss

    for epoch_index in range(epochs):
        # Optimisation pass.
        self.model.train()
        train_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(batch)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        # Evaluation pass: eval mode, no gradient tracking.
        self.model.eval()
        with torch.no_grad():
            val_losses = [batch_loss(batch).item() for batch in val_loader]

        # Means are taken over the number of batches in each loader.
        mean_train = sum(train_losses) / len(train_loader)
        mean_val = sum(val_losses) / len(val_loader)
        print(f"Epoch {epoch_index + 1}: Train Loss = {mean_train}, "
              f"Val Loss = {mean_val}")

In [None]:
def train_model(self, epochs=3, learning_rate=2e-5):
    """Fine-tune ``self.model`` with progress output for every epoch.

    Loaders come from ``self.create_dataloaders()``. During the training
    pass the current batch loss is printed every tenth batch; after the
    validation pass the mean training and validation losses are printed.

    Args:
        self: Object exposing ``model``, ``device`` and
            ``create_dataloaders()``.
        epochs: Number of epochs to run.
        learning_rate: Learning rate passed to ``torch.optim.AdamW``.

    Returns:
        None.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    print("Starting training...")
    try:
        train_loader, val_loader = self.create_dataloaders()
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)

        def run_model(batch):
            """Send one batch to the device and run a labelled forward pass."""
            ids = batch['input_ids'].to(self.device)
            mask = batch['attention_mask'].to(self.device)
            targets = batch['label'].to(self.device)
            return self.model(ids, attention_mask=mask, labels=targets)

        for epoch in range(epochs):
            print(f"\nEpoch {epoch+1}/{epochs}")

            # Training phase
            self.model.train()
            running_train_loss = 0
            for batch_number, batch in enumerate(train_loader, start=1):
                optimizer.zero_grad()
                loss = run_model(batch).loss
                running_train_loss += loss.item()
                loss.backward()
                optimizer.step()

                # Progress line on every tenth batch.
                if batch_number % 10 == 0:
                    print(f"Batch {batch_number}/{len(train_loader)}, "
                          f"Loss: {loss.item():.4f}")

            avg_train_loss = running_train_loss / len(train_loader)

            # Validation phase: eval mode, no gradient tracking.
            self.model.eval()
            running_val_loss = 0
            with torch.no_grad():
                for batch in val_loader:
                    running_val_loss += run_model(batch).loss.item()

            avg_val_loss = running_val_loss / len(val_loader)
            print(f"Epoch {epoch+1} Summary:")
            print(f"Average Training Loss: {avg_train_loss:.4f}")
            print(f"Average Validation Loss: {avg_val_loss:.4f}")

    except Exception as error:
        print(f"Error during training: {error}")
        raise

In [None]:
def predict(self, text):
    """Return softmax class probabilities for ``text``.

    The text is tokenized (padded, truncated to 512 tokens, returned as
    PyTorch tensors), moved to ``self.device`` and passed through
    ``self.model`` in eval mode without gradient tracking.

    Args:
        self: Object exposing ``tokenizer``, ``model`` and ``device``.
        text: Input forwarded unchanged to ``self.tokenizer``.

    Returns:
        ``torch.softmax`` of the model's ``logits`` along dimension 1.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    print("Making prediction...")
    try:
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        )
        encoded = encoded.to(self.device)

        self.model.eval()
        with torch.no_grad():
            logits = self.model(**encoded).logits
            return torch.softmax(logits, dim=1)

    except Exception as error:
        print(f"Error during prediction: {error}")
        raise

In [None]:
def _bind_model_methods():
    """Attach the module-level helper functions to ``MultilingualNLPModel``.

    ``preprocess_data``, ``create_dataloaders``, ``train``, ``train_model``
    and ``predict`` are defined as plain functions taking ``self``, outside
    the class body, so instances do not have them as methods and
    ``model.train_model(...)`` raises ``AttributeError``. Setting them as
    class attributes makes them ordinary bound methods.
    """
    for func in (preprocess_data, create_dataloaders, train, train_model, predict):
        # Do not overwrite an attribute the class already defines itself.
        if not hasattr(MultilingualNLPModel, func.__name__):
            setattr(MultilingualNLPModel, func.__name__, func)


def main():
    """Create the model, train it for three epochs and print one prediction.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        # Must run before the instance's methods are used below.
        _bind_model_methods()

        print("Creating model instance...")
        model = MultilingualNLPModel()

        print("Starting training process...")
        model.train_model(epochs=3)

        print("Making a test prediction...")
        sample_text = "Example sign language text"
        predictions = model.predict(sample_text)
        print(f"Predictions: {predictions}")

    except Exception as e:
        print(f"Error in main execution: {e}")
        raise


if __name__ == "__main__":
    main()

Creating model instance...
Initializing model...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Starting training process...
Error in main execution: 'MultilingualNLPModel' object has no attribute 'train_model'


AttributeError: 'MultilingualNLPModel' object has no attribute 'train_model'