In [None]:
# Install the Tesseract OCR engine (system package) and the pytesseract Python wrapper.
# NOTE(review): neither install is version-pinned; consider pinning, and consider
# `%pip install` for the Python package so it targets the kernel's environment.
!sudo apt-get install tesseract-ocr -y
!pip install pytesseract



import cv2
import os
import torch
import numpy as np
from PIL import Image
import pytesseract
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer, ViTForImageClassification, ViTImageProcessor
from google.colab import drive

# ---- Environment ----------------------------------------------------------
# Mount Google Drive so the fine-tuned checkpoints and sample images are
# reachable under /content/drive.
drive.mount('/content/drive')

# Point pytesseract at the Tesseract binary.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# ---- Text branch (RoBERTa, TensorFlow) ------------------------------------
# Tokenizer uses the stock roberta-base vocabulary; the classifier itself is
# loaded from a checkpoint directory on Drive.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
text_model = TFRobertaForSequenceClassification.from_pretrained(
    '/content/drive/My Drive/Colab Notebooks/private-data/models/roberta-cyberbullying-classifier'
)

# ---- Image branch (ViT, PyTorch) ------------------------------------------
# NOTE(review): the image processor comes from 'google/vit-base-patch16-224'
# while the backbone comes from 'google/vit-base-patch16-224-in21k' - confirm
# this matches the preprocessing used when the checkpoint was fine-tuned.
vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
vit_model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    num_labels=4,
)

# Overwrite the freshly initialised ViT weights with the saved state dict,
# mapping all tensors onto the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
image_model_path = '/content/drive/My Drive/Colab Notebooks/private-data/models/best_vit_model.pth'
vit_model.load_state_dict(
    torch.load(image_model_path, map_location=torch.device('cpu'))
)

# Define function to preprocess the image
def preprocess_final(im):
    """Prepare an image for OCR: smooth, convert to grayscale, inverse-binarise.

    Args:
        im: Image array; it is converted with COLOR_BGR2GRAY, so it is treated
            as a BGR colour image.

    Returns:
        The thresholded single-channel image produced by ``cv2.threshold``.
    """
    # Edge-preserving smoothing (diameter 5, sigmaColor 55, sigmaSpace 60).
    smoothed = cv2.bilateralFilter(im, 5, 55, 60)
    gray = cv2.cvtColor(smoothed, cv2.COLOR_BGR2GRAY)
    # Inverse binary threshold at 240 with max value 255
    # (cv2.THRESH_BINARY_INV is the named form of threshold type 1).
    _retval, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)
    return binary

# Define function to extract text from an image
def extract_text(image_path, custom_config=r"--oem 3 --psm 11 -c tessedit_char_whitelist='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz '"):
    """Run Tesseract OCR on an image file and return the text on a single line.

    Args:
        image_path: Path of the image file to read with OpenCV.
        custom_config: Extra Tesseract command-line options. The whitelist
            value is attached directly to ``tessedit_char_whitelist=`` (no
            space in between): the config string is split into shell-style
            arguments, so a space would detach the value and leave the
            whitelist empty.

    Returns:
        The recognised text with newlines replaced by spaces and with
        leading/trailing whitespace (including the trailing form feed that
        Tesseract appends) removed. An empty string means no text was found.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of deep inside OpenCV.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = preprocess_final(img)
    text = pytesseract.image_to_string(img, lang='eng', config=custom_config)
    # strip() also removes the '\x0c' page separator so blank pages yield ''.
    return text.replace('\n', ' ').strip()

# Define function to classify text using RoBERTa
def classify_text(text):
    """Predict the class index of a piece of text with the RoBERTa classifier.

    The text is tokenized with ``roberta_tokenizer`` (special tokens added,
    padded/truncated to 512 tokens) and passed to ``text_model``.

    Args:
        text: The string to classify.

    Returns:
        The index of the highest-probability class for the single input, as
        returned by ``np.argmax``.
    """
    encoded = roberta_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors="tf",
    )
    model_output = text_model(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
    )
    class_probabilities = tf.nn.softmax(model_output.logits, axis=1)
    # Batch of one: take the prediction for the first (only) row.
    return np.argmax(class_probabilities, axis=1)[0]

# Define function to classify image using ViT
def classify_image(image_path):
    """Predict the class index of an image file with the ViT classifier.

    Args:
        image_path: Path of the image; it is opened with PIL and converted
            to RGB before being handed to ``vit_processor``.

    Returns:
        The index of the highest-probability class as a Python int.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    pixel_values = vit_processor(images=rgb_image, return_tensors="pt")['pixel_values']
    # Inference only: no gradient tracking for the forward pass.
    with torch.no_grad():
        logits = vit_model(pixel_values).logits
    class_probabilities = torch.nn.functional.softmax(logits, dim=1)
    return torch.argmax(class_probabilities, dim=1).item()

# Define function for late fusion
def late_fusion(text_class, image_class):
    """Combine the text and image class predictions into one message.

    Args:
        text_class: Class index predicted from the text.
        image_class: Class index predicted from the image.

    Returns:
        A message string: when the two predictions disagree, it reports both
        labels; when they agree on class 0, it reports no cyberbullying;
        when they agree on any other class, it reports that class.
    """
    # Disagreement between modalities: report both labels.
    if text_class != image_class:
        return f"Input contains cyberbullying. Text label is: {text_class} and Image label is: {image_class}"
    # Both agree; class 0 is the "no cyberbullying" label.
    if text_class == 0:
        return "Input does not contain any Cyber-bullying."
    return f"Input contains this class {text_class} of cyberbullying."

# Main function to handle input and perform classification
def process_input(image_path):
    """Run OCR, text classification, image classification and late fusion.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A dict with keys ``'extracted_text'`` (the OCR output as returned by
        ``extract_text``), ``'text_label'``, ``'image_label'`` and
        ``'fusion_message'``. When the OCR output contains no visible
        characters, the text classifier is skipped and the messages say so.
    """
    extracted_text = extract_text(image_path)

    # OCR output may consist only of whitespace or a form-feed page separator
    # ('\x0c'); such strings are truthy, so test the stripped text instead.
    has_text = bool(extracted_text and extracted_text.strip())
    text_class = classify_text(extracted_text) if has_text else None

    image_class = classify_image(image_path)

    if text_class is not None:
        text_label = f"Text label: {text_class}"
        fusion_message = late_fusion(text_class, image_class)
    else:
        text_label = "No text prediction"
        fusion_message = "No text found to classify."

    return {
        'extracted_text': extracted_text,
        'text_label': text_label,
        'image_label': f"Image label: {image_class}",
        'fusion_message': fusion_message
    }

# Example usage: run the full pipeline (OCR, text and image classification,
# late fusion) on one sample image from Drive and print the result dict.
image_path =  '/content/drive/My Drive/Colab Notebooks/public-data/image/net/test.jpg'
results = process_input(image_path)
print(results)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,924 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/private-data/models/roberta-cyberbullying-classifier.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


{'extracted_text': 'Fuck  ou  itch \x0c', 'text_label': 'Text label: 1', 'image_label': 'Image label: 2', 'fusion_message': 'Input contains cyberbullying. Text label is: 1 and Image label is: 2'}


In [None]:
def process_input(image_path, correct_text_label, correct_image_label):
    """Classify an image's text and pixels, and score both against known labels.

    Note: this redefines the earlier one-argument ``process_input`` in this
    notebook; after this cell runs, the reference labels are required.

    Args:
        image_path: Path of the image to analyse.
        correct_text_label: Expected class index for the text prediction.
        correct_image_label: Expected class index for the image prediction.

    Returns:
        A dict with keys ``'extracted_text'``, ``'text_label'``,
        ``'image_label'``, ``'fusion_message'`` and ``'average_accuracy'``.
        ``'average_accuracy'`` is the fraction of the two predictions (text
        and image) that match their reference label; a missing text
        prediction counts as incorrect.
    """
    extracted_text = extract_text(image_path)

    # OCR output may consist only of whitespace or a form-feed page separator
    # ('\x0c'); such strings are truthy, so test the stripped text instead.
    has_text = bool(extracted_text and extracted_text.strip())
    text_class = classify_text(extracted_text) if has_text else None
    image_class = classify_image(image_path)

    # Check correctness of each modality against its reference label.
    text_correct = (text_class == correct_text_label) if text_class is not None else False
    image_correct = (image_class == correct_image_label)

    # Average over the two evaluated predictions: text and image.
    outcomes = [text_correct, image_correct]
    average_accuracy = sum(outcomes) / len(outcomes)

    if text_class is not None:
        text_label = f"Text label: {text_class}"
        fusion_message = late_fusion(text_class, image_class)
    else:
        text_label = "No text prediction"
        fusion_message = "No text found to classify."

    return {
        'extracted_text': extracted_text,
        'text_label': text_label,
        'image_label': f"Image label: {image_class}",
        'fusion_message': fusion_message,
        'average_accuracy': average_accuracy
    }


In [None]:
# Reference (ground-truth) labels for the sample image, used to score the
# text and image predictions.
correct_text_label = 1
correct_image_label = 1

# Path to the sample image on the mounted Drive.
image_path = '/content/drive/My Drive/Colab Notebooks/public-data/image/net/test.jpg'

# Run the labelled variant of process_input; the result dict additionally
# carries 'average_accuracy' over the text and image predictions.
results = process_input(image_path, correct_text_label, correct_image_label)
print(results)


{'extracted_text': 'Fuck  ou  itch \x0c', 'text_label': 'Text label: 1', 'image_label': 'Image label: 2', 'fusion_message': 'Input contains cyberbullying. Text label is: 1 and Image label is: 2', 'average_accuracy': 0.5}
