In [16]:
# Mount Google Drive: every model checkpoint path used later in this notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
!pip install nltk
!pip install pandas
!pip install torch
!pip install evaluate
!pip install datasets
!pip install transformers
!pip install scikit-learn
!pip install accelerate
!pip install flask
!pip install pyngrok
!pip install flask-ngrok



In [26]:
import random
import re
import string
from functools import lru_cache

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from transformers import pipeline
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoModelForTokenClassification

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
# Seed every random number generator the notebook touches (Python, NumPy,
# PyTorch) with the same value so repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7b66d3623750>

# **1. Mixed Language Detection**

In [20]:
# Directory of the fine-tuned language-detection checkpoint. Nothing is loaded
# here; the tokenizer and model are read from this path when language_detect() runs.
# Alternative checkpoint directory (same name without the "-Latest" suffix), kept for reference:
# lang_detect_model_path = "/content/drive/MyDrive/Colab Notebooks/ner_project/models/Code-Mixed-mBERT-Fine-Tuned-Language-Detection"
lang_detect_model_path = "/content/drive/MyDrive/Colab Notebooks/ner_project/models/Code-Mixed-mBERT-Fine-Tuned-Language-Detection-Latest"

In [21]:
@lru_cache(maxsize=None)
def _load_language_detector(model_path):
    """Load the language-detection tokenizer and model for ``model_path`` once.

    The result is cached per path so that repeated calls (for example one per
    web request) do not re-read the checkpoint from disk every time. A failed
    load raises and is not cached, so fixing the path and retrying works.
    """
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)
    # Inference only: switch off dropout and other training-time behaviour.
    model.eval()
    return tokenizer, model


def language_detect(sentence):
    """Predict the language label of ``sentence`` with the fine-tuned classifier.

    Args:
        sentence: Text to classify. It is truncated to 128 tokens.

    Returns:
        One of "Hindi", "Marathi", "Gujarati", "Telugu", "English", or
        "Others" if the predicted class id has no entry in the label mapping.

    Raises:
        OSError: If ``lang_detect_model_path`` does not point to a loadable
            checkpoint (raised by ``from_pretrained``).
    """
    lang_detect_tokenizer, lang_detect_model = _load_language_detector(lang_detect_model_path)

    # Tokenize the sentence
    inputs = lang_detect_tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Make a prediction without tracking gradients
    with torch.no_grad():
        outputs = lang_detect_model(**inputs)

    # Index of the highest-scoring class for the single input sentence
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # NOTE(review): this id -> language order must match the label encoding used
    # when the classifier was fine-tuned; confirm against the training notebook.
    label_mapping = {
        0: "Hindi",
        1: "Marathi",
        2: "Gujarati",
        3: "Telugu",
        4: "English"
    }

    decoded_label = label_mapping.get(predicted_class, "Others")
    print(f"Language Detection Predicted class: {predicted_class}, Decoded Label: {decoded_label}")

    return decoded_label

# **2. Translate the Text to the Dominant Language**

In [22]:
# Directory of the saved seq2seq translation checkpoint; translate_to() loads
# its tokenizer and model from this path.
translation_model_path = "/content/drive/MyDrive/Colab Notebooks/ner_project/models/Language-Detection-NLLB"

In [23]:
# Maps the language names returned by language_detect() to the language-code
# tokens that translate_to() hands to the translation tokenizer as the forced
# first output token (i.e. the target language of the translation).
language_mapping = {
    "Hindi": "hin_Deva",
    "Marathi": "mar_Deva",
    "Gujarati": "guj_Gujr",
    "Telugu": "tel_Telu",
    "English": "eng_Latn"
}

@lru_cache(maxsize=None)
def _load_translator(model_path):
    """Load the translation tokenizer and seq2seq model for ``model_path`` once.

    Cached per path so that translate_to() does not re-read the checkpoint on
    every call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    return tokenizer, model


def translate_to(paragraph, target_lang=language_mapping['English']):
    """Translate ``paragraph`` sentence by sentence into ``target_lang``.

    Args:
        paragraph: Text to translate; it is split with NLTK's sent_tokenize.
        target_lang: Language-code token of the target language (one of the
            values of ``language_mapping``). Defaults to the English code.

    Returns:
        The translated sentences joined by single spaces. An empty paragraph
        yields an empty string.
    """
    translation_tokenizer, translation_model = _load_translator(translation_model_path)

    # The target-language token id is the same for every sentence, so resolve
    # it once. convert_tokens_to_ids is used instead of tokenizer.lang_code_to_id.
    target_token_id = translation_tokenizer.convert_tokens_to_ids(target_lang)

    translated_sentences = []
    for sentence in sent_tokenize(paragraph):
        inputs = translation_tokenizer(sentence, return_tensors="pt")
        translated_tokens = translation_model.generate(**inputs, forced_bos_token_id=target_token_id)
        translated_sentence = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        translated_sentences.append(translated_sentence)
    return " ".join(translated_sentences)

In [25]:
# End-to-end check of steps 1 and 2: detect the language label of a code-mixed
# sentence, then translate the whole sentence into that language.
sentence = '''Mujhe kal office jaana hai, but I don’t feel like it.'''
target_language = language_detect(sentence)
print(target_language)
# language_mapping turns the detected language name into the translator's language code.
translated_sent = translate_to(sentence, language_mapping[target_language])
print(language_mapping[target_language])
print(translated_sent)

OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/Colab Notebooks/ner_project/models/Code-Mixed-mBERT-Fine-Tuned-Language-Detection-Latest'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

# **3. NER on the Translated (Predominant-Language) Text**

In [None]:
# Local checkpoint directories of the three NER models used in this section;
# each is loaded by the corresponding *_perform_ner_sentence function below.
xlm_based_ner_path = "/content/drive/MyDrive/Colab Notebooks/ner_project/models/XML-Roberta-Large-Finetuned"
bert_based_ner_path = "/content/drive/MyDrive/Colab Notebooks/ner_project/models/Bert-Base-Multilingual-Cased-NER-hrl"
fine_tuned_ner_path = "/content/drive/MyDrive/Colab Notebooks/ner_project/models/IndicBERT-Fine-Tuned/ner_model/checkpoint-3849"

### 1. **Davlan/bert-base-multilingual-cased-ner-hrl**

In [None]:
@lru_cache(maxsize=None)
def _load_bert_ner_pipeline(model_path):
    """Build the "ner" pipeline for the checkpoint at ``model_path`` once.

    Cached per path so the tokenizer and model are not reloaded from disk on
    every call to bert_perform_ner_sentence().
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)
    use_gpu = torch.cuda.is_available()
    return pipeline(
        "ner",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        # First CUDA device when available, otherwise CPU (-1).
        device=0 if use_gpu else -1,
        model_kwargs={"torch_dtype": torch.float16} if use_gpu else {}
    )


def bert_perform_ner_sentence(text):
    """Run the locally saved multilingual BERT NER model on ``text``.

    Args:
        text: Sentence (or short passage) to tag.

    Returns:
        A list of dicts, one per token the pipeline reports, of the form
        ``{"entity": <token text>, "type": <predicted tag>}``.
    """
    bert_based_ner_pipe = _load_bert_ner_pipeline(bert_based_ner_path)
    entities = bert_based_ner_pipe(text)
    formatted_entities = [{"entity": e["word"], "type": e["entity"]} for e in entities]
    return formatted_entities

# Example usage: tag a Hindi sentence with the multilingual BERT NER model.
# The call is the last expression of the cell, so its result is displayed.
translated_sent = "मुकेश अंबानी रिलायंस इंडस्ट्रीज के अध्यक्ष हैं।"
bert_perform_ner_sentence(translated_sent)

### 2. **xlm-roberta-large-finetuned-conll03-english**



In [None]:
@lru_cache(maxsize=None)
def _load_xlm_ner_pipeline(model_path):
    """Build the "ner" pipeline for the checkpoint at ``model_path`` once.

    Cached per path so the tokenizer and model are not reloaded from disk on
    every call to xlm_perform_ner_sentence().
    """
    # Load from local directory
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)
    use_gpu = torch.cuda.is_available()
    return pipeline(
        "ner",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        # First CUDA device when available, otherwise CPU (-1).
        device=0 if use_gpu else -1,
        model_kwargs={"torch_dtype": torch.float16} if use_gpu else {}
    )


def xlm_perform_ner_sentence(text):
    """Run the locally saved XLM-RoBERTa NER model on ``text``.

    Args:
        text: Sentence (or short passage) to tag.

    Returns:
        A list of dicts, one per token the pipeline reports, of the form
        ``{"entity": <token text>, "type": <predicted tag>}``.
    """
    ner_pipe = _load_xlm_ner_pipeline(xlm_based_ner_path)

    # Extract named entities and keep only the token text and its tag
    entities = ner_pipe(text)
    formatted_entities = [
        {"entity": e["word"], "type": e["entity"]}
        for e in entities
    ]
    return formatted_entities

# Example usage: tag an English sentence with the XLM-RoBERTa NER model.
# The call is the last expression of the cell, so its result is displayed.
translated_sent = "Elon Musk is the CEO of Tesla and SpaceX."
xlm_perform_ner_sentence(translated_sent)


### 3. **Fine Tuned IndicBERT**



In [None]:
def group_tokens_by_word(input_text, offsets, word_ids, probs, id2label, uncertainty_threshold=0.5):
    """Merge sub-word token predictions into one prediction per word.

    Args:
        input_text: Original text (not used by the grouping itself).
        offsets: Per-token ``[start, end]`` character offsets.
        word_ids: Per-token word index, or ``None`` for special tokens.
        probs: Per-token probability vectors, indexable by token position.
        id2label: Mapping from label index to label string.
        uncertainty_threshold: Words whose best averaged probability is below
            this value are labelled "O".

    Returns:
        A list of dicts ordered by word id, each with the keys ``word_id``,
        ``start``, ``end``, ``avg_prob``, ``label`` and ``token_indices``.
    """
    buckets = {}
    for tok_idx, wid in enumerate(word_ids):
        # Special tokens carry no word id and are ignored.
        if wid is None:
            continue
        span = offsets[tok_idx]
        tok_prob = probs[tok_idx]
        bucket = buckets.get(wid)
        if bucket is None:
            buckets[wid] = {
                "start": span[0],
                "end": span[1],
                "probs": [tok_prob],
                "token_indices": [tok_idx],
            }
            continue
        # Widen the word's character span and collect this piece's evidence.
        bucket["start"] = min(bucket["start"], span[0])
        bucket["end"] = max(bucket["end"], span[1])
        bucket["probs"].append(tok_prob)
        bucket["token_indices"].append(tok_idx)

    word_groups = []
    for wid in sorted(buckets):
        bucket = buckets[wid]
        # Average the probability vectors of all pieces of the word.
        stacked = torch.stack(bucket["probs"], dim=0)
        avg_prob = stacked.mean(dim=0)
        best_prob, best_idx = torch.max(avg_prob, dim=0)
        label = id2label[best_idx.item()]
        # Low-confidence words fall back to the outside tag.
        if best_prob.item() < uncertainty_threshold:
            label = "O"
        word_groups.append({
            "word_id": wid,
            "start": bucket["start"],
            "end": bucket["end"],
            "avg_prob": avg_prob,
            "label": label,
            "token_indices": bucket["token_indices"],
        })
    return word_groups

def postprocess_labels(word_groups):
    """Repair the per-word tags so the sequence is a valid BIO tagging.

    Rules, applied left to right on the already-corrected previous tag:
      * "O" is kept as is.
      * A tagged word that directly follows a tagged word of the same entity
        type is forced to ``I-<type>`` (adjacent same-type words are merged).
      * Any other ``I-<type>`` has nothing to continue, whether it is the
        first word, follows an "O", or follows a different entity type, so
        it is rewritten to ``B-<type>``.

    Args:
        word_groups: Sequence of dicts that each carry a ``"label"`` key.

    Returns:
        The list of corrected label strings, one per word group.
    """
    final_labels = []
    for group in word_groups:
        label = group["label"]
        # Treat the start of the sequence like a preceding "O".
        prev_label = final_labels[-1] if final_labels else "O"
        if label != "O":
            curr_entity = label.split("-")[-1]
            continues_previous = (
                prev_label != "O" and prev_label.split("-")[-1] == curr_entity
            )
            if continues_previous:
                # Same entity type as the previous word: continue that entity.
                label = "I-" + curr_entity
            elif label.startswith("I-"):
                # An I- tag with no entity to continue must open a new one.
                label = "B-" + curr_entity
        final_labels.append(label)
    return final_labels

def reconstruct_words(input_text, word_groups):
    """Slice each word's surface form out of the original text.

    Args:
        input_text: The text the character offsets refer to.
        word_groups: Sequence of dicts with ``"start"`` and ``"end"`` offsets.

    Returns:
        One string per group, with leading/trailing spaces and double quotes
        removed.
    """
    return [
        input_text[span["start"]:span["end"]].strip(' "')
        for span in word_groups
    ]


@lru_cache(maxsize=None)
def _load_finetuned_ner(model_path):
    """Load the fine-tuned NER tokenizer and model for ``model_path`` once.

    Cached per path so bert_finetuned_perform_ner_sentence() does not re-read
    the checkpoint on every call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()
    return tokenizer, model


def bert_finetuned_perform_ner_sentence(input_text):
    """Tag ``input_text`` with the fine-tuned NER checkpoint, one tag per word.

    Sub-word predictions are averaged per word (group_tokens_by_word), the
    tag sequence is repaired (postprocess_labels), and each word's text is cut
    from the original string via the tokenizer's character offsets.

    Args:
        input_text: Raw, un-split text to tag.

    Returns:
        A list of ``{"entity": <word text>, "type": <tag>}`` dicts, one per
        word, including words tagged "O".
    """
    # Step 1: Load (or reuse) the tokenizer and model.
    tokenizer, model = _load_finetuned_ner(fine_tuned_ner_path)

    # Step 2: Define label mapping.
    # NOTE(review): assumed to match the label ids the checkpoint was trained with - confirm.
    id2label = {
        0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC", 7: "B-EVT", 8: "I-EVT",
        9: "B-PROD", 10: "I-PROD"
    }

    # Step 3: Tokenize the raw text with offset mappings and word_ids.
    # Do NOT pre-split the text.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        return_offsets_mapping=True  # returns offsets for each token
    )
    # The model does not accept offset_mapping, so remove it from the inputs.
    offsets = inputs.pop("offset_mapping")[0].tolist()  # List of [start, end] pairs.
    word_ids = inputs.word_ids(batch_index=0)            # Maps each token to its originating word (or None).

    # Step 4: Perform prediction and compute probabilities.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits[0]  # shape: (seq_len, num_labels)
    probs = torch.softmax(logits, dim=1)  # shape: (seq_len, num_labels)

    # Step 5: Group tokens by word (applies the uncertainty fallback to "O").
    word_groups = group_tokens_by_word(input_text, offsets, word_ids, probs, id2label, uncertainty_threshold=0.5)

    # Step 6: Reconstruct words exactly as in the original input.
    final_words = reconstruct_words(input_text, word_groups)

    # Step 7: Post-process labels to enforce continuity.
    final_labels = postprocess_labels(word_groups)
    formatted_entities = [{"entity": e, "type": l} for e, l in zip(final_words, final_labels)]
    return formatted_entities

# Example usage: tag a Hindi sentence with the fine-tuned NER model.
# The call is the last expression of the cell, so its result is displayed.
translated_sent = "मुकेश अंबानी रिलायंस इंडस्ट्रीज के अध्यक्ष हैं।"
bert_finetuned_perform_ner_sentence(translated_sent)

# **4. Clubbing All the Pieces Together**

In [None]:
# The ngrok authtoken is a credential and must not be stored in the notebook.
# Read it from the NGROK_AUTHTOKEN environment variable, or prompt for it
# (getpass does not echo the value into the cell output).
# NOTE(review): the token that used to be hard-coded in this cell has been
# exposed and should be revoked in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

In [None]:
!mkdir ./templates

In [None]:
# Copy the web page template from Drive into ./templates, where the Flask app
# below looks up 'index.html' (Flask's default template folder).
!cp /content/drive/MyDrive/Colab\ Notebooks/ner_project/templates/index.html ./templates/

In [None]:
import time
import requests
from flask import Flask, request, jsonify, render_template
from pyngrok import ngrok

# Create the Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)

def detect_language(text):
    """Detect the language of ``text`` for the web API.

    Returns the label produced by language_detect() unchanged when it is one
    of the supported languages (compared case-insensitively), otherwise the
    string "others". The raw label is also printed.
    """
    supported = ["english", "hindi", "marathi", "telugu", "gujarati"]
    detected = language_detect(text)
    print(detected)
    if detected.lower() in supported:
        return detected
    return "others"


def perform_ner(text, model_choice):
    """Run the NER model selected by ``model_choice`` on ``text``.

    Args:
        text: Text to tag.
        model_choice: "BERT Base Model", "XLM-R Model" or
            "Fine-Tuned BERT Model".

    Returns:
        The selected model's list of entity dicts, or an empty list when
        ``model_choice`` is not one of the names above.
    """
    if model_choice == "BERT Base Model":
        return bert_perform_ner_sentence(text)
    if model_choice == "XLM-R Model":
        return xlm_perform_ner_sentence(text)
    if model_choice == "Fine-Tuned BERT Model":
        return bert_finetuned_perform_ner_sentence(text)
    # Unknown model name: nothing to run.
    return []

@app.route('/')
def index():
    """Serve the web page: render the 'index.html' template for the root URL."""
    return render_template('index.html')

@app.route('/process', methods=['POST'])
def process_text():
    """Detect the language of the posted text, translate it and run NER.

    Expects a JSON object with:
      * ``text``: the text to process;
      * ``model_choice`` (optional): NER model name understood by
        perform_ner(); defaults to "Fine-Tuned BERT Model".

    Always answers with a JSON object containing ``detected_language``,
    ``translated_text`` and ``ner_results``. A missing, malformed or non-object
    JSON body, or a missing/blank/non-string ``text``, produces the
    "Empty input" error payload instead of an unhandled exception.
    """
    # silent=True returns None instead of raising on a missing or invalid JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    raw_text = data.get("text", "")
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    model_choice = data.get("model_choice", "Fine-Tuned BERT Model")  # Default model

    if not text:
        return jsonify({
            "detected_language": "Error",
            "translated_text": "Error: Empty input",
            "ner_results": []
        })

    detected_lang = detect_language(text)
    # None when the detected label has no translation language code.
    target_code = language_mapping.get(detected_lang)

    if detected_lang == "others" or target_code is None:
        return jsonify({
            "detected_language": "Others",
            "translated_text": "Not supported",
            "ner_results": []
        })

    translated_text = translate_to(text, target_code)
    ner_results = perform_ner(translated_text, model_choice)  # Call function based on model selection

    return jsonify({
        "detected_language": detected_lang,
        "translated_text": translated_text,
        "ner_results": ner_results
    })

# Open an ngrok tunnel to local port 5000, show its public address, then start
# the Flask server on that port (app.run blocks until the cell is interrupted).
if __name__ == '__main__':
    tunnel = ngrok.connect(5000)
    public_url = tunnel.public_url
    print(f"Public URL: {public_url}")
    app.run(port=5000)



In [None]:
# Tear down in the right order: close the tunnel while the ngrok process is
# still running, then stop the process. Calling kill() first leaves
# disconnect() with nothing to talk to.
ngrok.disconnect(public_url)
ngrok.kill()