In [None]:
# Install the Tesseract OCR system package and the pytesseract Python wrapper.
# NOTE(review): neither install is version-pinned — consider pinning so reruns
# of this notebook use the same OCR stack.
!sudo apt-get install tesseract-ocr -y
!pip install pytesseract

import pytesseract
from PIL import Image
import os
import csv
import cv2
import numpy as np
import torch
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer, ViTForImageClassification, ViTConfig, ViTImageProcessor

# Mount Google Drive so the image folder, CSV outputs and model files under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Install Tesseract dependencies
# NOTE(review): tesseract-ocr is already installed by the apt-get line at the
# top of this cell, so the first command looks redundant. Neither command
# passes -y — confirm they do not stall or abort on a confirmation prompt.
!apt install tesseract-ocr
!apt install libtesseract-dev

# Download the English language models for Tesseract
# (the mv below expects wget to have saved it as "eng.traineddata?raw=true").
!wget https://github.com/tesseract-ocr/tessdata/blob/main/eng.traineddata?raw=true


# Make a directory for tessdata and move downloaded files there
!mkdir -p tessdata
!mv eng.traineddata?raw=true tessdata/eng.traineddata


# Set the TESSDATA_PREFIX environment variable
# NOTE(review): this points at the system tessdata directory, not the
# ./tessdata folder populated above, so the downloaded model appears to be
# unused — confirm which language data is intended.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'



# Function to preprocess the image
def preprocess_final(im):
    """Prepare an image for OCR: smooth, convert to grayscale, inverse-threshold.

    Args:
        im: Image array as accepted by ``cv2.bilateralFilter``; it is then
            converted with ``COLOR_BGR2GRAY``, so BGR channel order is assumed.

    Returns:
        Single-channel array where pixels brighter than 240 are 0 and all
        other pixels are 255.
    """
    # Edge-preserving smoothing: d=5, sigmaColor=55, sigmaSpace=60.
    smoothed = cv2.bilateralFilter(im, 5, 55, 60)
    gray = cv2.cvtColor(smoothed, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY_INV is the named form of the threshold type 1.
    binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)[1]
    return binary

# Function to extract text from an image
def extract_text(image_path, custom_config=r'--oem 3 --psm 11 -c "tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz "'):
    """Run Tesseract OCR on an image file and return the text on one line.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        custom_config: Extra Tesseract command-line options. pytesseract
            splits this string shell-style, so the whole ``name=value`` pair
            of ``-c`` must be a single quoted token. The previous default
            (``whitelist= '...'``) split into an empty whitelist plus a stray
            argument, which left the whitelist unapplied.

    Returns:
        The recognised text with every newline replaced by a space.

    Raises:
        ValueError: If the file cannot be decoded as an image.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of deep inside OpenCV.
        raise ValueError(f"Could not read image: {image_path}")
    img = preprocess_final(img)
    text = pytesseract.image_to_string(img, lang='eng', config=custom_config)
    return text.replace('\n', ' ')
















# Path to the folder containing images and CSV files

image_folder_path = '/content/drive/My Drive/Colab Notebooks/public-data/image/net'
extracted_text_csv = '/content/drive/My Drive/Colab Notebooks/public-data/extracted_text.csv'
roberta_predictions_csv = '/content/drive/My Drive/Colab Notebooks/public-data/roberta_predictions.csv'
vit_predictions_csv = '/content/drive/My Drive/Colab Notebooks/public-data/vit_predictions.csv'


# Load RoBERTa model (fine-tuned weights from Drive, stock roberta-base tokenizer)
roberta_model = TFRobertaForSequenceClassification.from_pretrained('/content/drive/My Drive/Colab Notebooks/public-data/models/roberta-cyberbullying-classifier')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load ViT model: build the architecture from the base config with a 4-class
# head, then fill it with the fine-tuned weights stored on Drive.
model_path = '/content/drive/My Drive/Colab Notebooks/public-data/models/my_vit_model.pth'
config = ViTConfig.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=4)
vit_model = ViTForImageClassification(config)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source.
vit_model.load_state_dict(torch.load(model_path, map_location=device))
vit_model.to(device)
# A module constructed directly from a config starts in training mode;
# switch to eval mode so inference never runs with training-only behaviour.
vit_model.eval()

# Initialize the feature extractor for ViT
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

# Define function to get label from RoBERTa model prediction
def predict_text(text, tokenizer, model):
    """Classify a piece of text and return the index of the highest logit.

    Args:
        text: The string to classify.
        tokenizer: Object exposing ``encode_plus`` (here a RoBERTa tokenizer).
        model: Object whose ``predict`` takes ``[input_ids, attention_mask]``
            and returns a result with a ``logits`` attribute.

    Returns:
        The predicted class index for the single input.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # Explicit padding/truncation replaces the deprecated
        # pad_to_max_length flag and silences the "truncation was not
        # explicitly activated" warning; the resulting encoding is the same.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    # With return_tensors='tf' both entries are already batched tensors of
    # shape (1, max_length), so they can be passed to the model as they are.
    pred = model.predict([encoded['input_ids'], encoded['attention_mask']])
    return np.argmax(pred.logits, axis=1)[0]

# Function to get label from ViT model prediction
def predict_image(image_path, feature_extractor, model):
    """Classify an image file and return the index of the highest logit.

    Args:
        image_path: Path of the image to open with PIL.
        feature_extractor: Callable that turns a PIL image into a mapping
            containing a ``'pixel_values'`` PyTorch tensor.
        model: PyTorch module whose output exposes ``logits``.

    Returns:
        The predicted class index as a Python int.
    """
    # Close the file handle as soon as the pixels have been decoded.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt")['pixel_values']

    # The processor returns CPU tensors; move them to wherever the model's
    # weights live, otherwise the forward pass fails when the model is on GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        pixel_values = pixel_values.to(first_param.device)

    with torch.no_grad():
        outputs = model(pixel_values)
    # Softmax is monotonic, so the argmax of the raw logits is the same class.
    return torch.argmax(outputs.logits, dim=1).item()

# Extract text from each image and save it in CSV files along with model predictions
# Extract text from each image and save it in CSV files along with model predictions.
# Three CSVs are written side by side: raw OCR text, OCR text + RoBERTa label,
# and the ViT label. A row is only written when every step for that image succeeds.
with open(extracted_text_csv, 'w', newline='', encoding='utf-8') as text_file, \
     open(roberta_predictions_csv, 'w', newline='', encoding='utf-8') as roberta_file, \
     open(vit_predictions_csv, 'w', newline='', encoding='utf-8') as vit_file:

    text_writer = csv.writer(text_file)
    roberta_writer = csv.writer(roberta_file)
    vit_writer = csv.writer(vit_file)

    text_writer.writerow(['Image Name', 'Extracted Text'])
    roberta_writer.writerow(['Image Name', 'Extracted Text', 'RoBERTa Prediction'])
    vit_writer.writerow(['Image Name', 'ViT Prediction'])

    # os.listdir returns entries in arbitrary order; sorting makes the CSV
    # row order reproducible between runs.
    for image_name in sorted(os.listdir(image_folder_path)):
        image_path = os.path.join(image_folder_path, image_name)
        if not os.path.isfile(image_path):
            continue
        try:
            text = extract_text(image_path)
            # extract_text turns newlines into spaces, so "no text found"
            # arrives as whitespace rather than an empty string; strip before
            # deciding whether there is anything to classify.
            text_label = predict_text(text, roberta_tokenizer, roberta_model) if text.strip() else None
            image_label = predict_image(image_path, feature_extractor, vit_model)

            text_writer.writerow([image_name, text])
            roberta_writer.writerow([image_name, text, text_label])
            vit_writer.writerow([image_name, image_label])

            print(f"Processed {image_name}: Text label: {text_label}, Image label: {image_label}")
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing {image_name}: {e}")

print("Text extraction and model testing complete. Data saved.")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (9,886 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/public-data/models/roberta-cyberbullying-classifier.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Processed 1.jpg: Text label: 0, Image label: 1
Processed 2_3.jpg: Text label: 0, Image label: 3
Processed 0_1.jpg: Text label: 0, Image label: 0
Processed download.jpg: Text label: 0, Image label: 0
Processed 2_1.jpg: Text label: 0, Image label: 0
Processed images (2).jpg: Text label: 0, Image label: 2
Processed images.jpg: Text label: 0, Image label: 3
Processed 0106.png: Text label: 0, Image label: 2
Processed depositphotos_635595200-stock-photo-portrait-hand-middle-finger-business.jpg: Text label: 0, Image label: 2
Processed 2.jpg: Text label: 0, Image label: 0
Processed images (4).jpg: Text label: 0, Image label: 0
Processed 1106.jpg: Text label: 0, Image label: 2
Processed 0.jpg: Text label: 0, Image label: 3
Processed images (1).jpg: Text label: 0, Image label: 2
Processed 1_1.jpg: Text label: 0, Image label: 1
Processed test.jpg: Text label: 0, Image label: 2
Processed images (3).jpg: Text label: 0, Image label: 0
Processed download (2).jpg: Text label: 0, Image label: 0
Process

## **Single Image Check**

In [None]:

from PIL import Image
import numpy as np
import cv2
import pytesseract
from urllib.request import urlopen

# Manually set the path or URL here.
# The loading code below checks `path` first; `url` is only used when `path`
# is empty/None.
path = '/content/drive/My Drive/Colab Notebooks/public-data/image/net/test.jpg'
url = None  # Replace with your image URL or set to None

# Set pytesseract command (location of the tesseract executable)
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Function to preprocess the image
def preprocess_final(im):
    """Clean up an image for Tesseract and return an inverse-binary mask.

    The input is bilateral-filtered, reduced to grayscale using the BGR
    conversion code, and thresholded so that near-white pixels (> 240) map to
    0 and everything else maps to 255.

    Args:
        im: Colour image array; treated as BGR by the grayscale conversion.

    Returns:
        The thresholded single-channel image.
    """
    gray = cv2.cvtColor(cv2.bilateralFilter(im, 5, 55, 60), cv2.COLOR_BGR2GRAY)
    # cv2.THRESH_BINARY_INV has the integer value 1 used previously.
    _retval, mask = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)
    return mask

# Load image from path or URL
# Load the image from `path` (preferred) or `url`, always as 3-channel RGB so
# that grayscale, palette and RGBA files are handled too.
if path:
    with Image.open(path) as pil_img:
        rgb = np.array(pil_img.convert("RGB"))
elif url:
    with urlopen(url) as response:
        rgb = np.array(Image.open(response).convert("RGB"))
else:
    raise ValueError("No path or URL provided")

# PIL delivers RGB, while preprocess_final converts with COLOR_BGR2GRAY;
# swap the channels so the grayscale weights hit the right colours.
img = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

# Preprocess and extract text from image
img = preprocess_final(img)
# pytesseract splits the config shell-style, so the whole name=value pair of
# -c must be one quoted token; the earlier form (`whitelist= '...'`) left the
# whitelist empty, i.e. not applied. Only uppercase letters and space are
# allowed here, as in the original whitelist string.
custom_config = r'--oem 3 --psm 11 -c "tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ "'
text = pytesseract.image_to_string(img, lang='eng', config=custom_config)

print('-'*40)
# Join the OCR fragments with single spaces; dropping the newlines outright
# would fuse words that Tesseract reported on separate lines.
print("Meme Content: \n\n", " ".join(text.split()))
print('-'*40)


----------------------------------------
Meme Content: 

 Fuckouitch
----------------------------------------
