# Разработка модели детекции наклеек: Faster R-CNN

In [2]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

## Загружка файлов image.zip и coordinates.json, которые нужны для обучения и тестирования модели

In [3]:
from google.colab import files

uploaded = files.upload()

Saving coordinates.json to coordinates.json
Saving images.zip to images.zip


In [4]:
import zipfile
import os

# Убедитесь, что заменили 'your_uploaded_file.zip' на имя вашего загруженного файла.
zip_ref = zipfile.ZipFile('images.zip', 'r')

# Распаковка в корневую директорию сессии Colab
zip_ref.extractall('/')
zip_ref.close()

## Загрузка предобученной модели Faster R-CNN

In [5]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 115MB/s]


## Пользовательский классификатор

In [6]:
num_classes = 2  # фон и наклейка
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

## Загрузка и предобработка данных

In [7]:
import torchvision.transforms as T
import torchvision.transforms.functional as F

def get_transform(train):
    transforms = []
    if train:
        # Пример добавления специфичных для обучения трансформаций
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)

def transform(image, target, transforms):
    image = transforms(image)
    # Если есть трансформации, влияющие на размер изображения, они должны быть применены к целям (targets) здесь
    return image, target

In [14]:
from PIL import Image
class CustomDataset():
    def __init__(self, images_folder, annotations_file, transforms=None):
        self.transforms = transforms
        self.images_folder = images_folder
        with open(annotations_file, 'r') as file:
            self.annotations = json.load(file)

        self.imgs = list(self.annotations.keys())

    def __getitem__(self, idx):
        img_name = self.imgs[idx]
        img_path = os.path.join(self.images_folder, img_name)
        image = Image.open(img_path).convert("RGB")

        # Получение одного bounding box из аннотаций
        box = self.annotations[img_name]
        box[1], box[3] = box[3], box[1]

        # Предполагаем, что каждый бокс - это список с четырьмя элементами
        boxes = torch.as_tensor([box], dtype=torch.float32)  # Преобразование в тензор [1, 4]

        # Всего один объект на изображении
        num_objs = 1
        labels = torch.ones((num_objs,), dtype=torch.int64)

        # Дополнительные поля для target
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd
        }

        if self.transforms:
            image, target = transform(image, target, self.transforms)

        return image, target

    def __len__(self):
        return len(self.imgs)




## Обучение модели

In [8]:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    for i, (images, targets) in enumerate(data_loader):
        # Убедимся, что images - это тензоры, а не объекты PIL.Image
        images = [image.to(device) if isinstance(image, torch.Tensor) else F.to_tensor(image).to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        losses.backward()
        optimizer.step()

        if i % print_freq == 0:
            print(f"Epoch: {epoch} [{i}/{len(data_loader)}] Loss: {losses.item()}")



In [9]:
@torch.no_grad()  # для уменьшения использования памяти
def evaluate(model, data_loader, device):
    # model.eval()
    total_loss = 0
    for images, targets in data_loader:
        images = [image.to(device) if isinstance(image, torch.Tensor) else F.to_tensor(image).to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        total_loss += sum(loss for loss in loss_dict.values())
    return total_loss / len(data_loader)


In [10]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)


In [11]:
def collate_fn(batch):
    return tuple(zip(*batch))


In [16]:
def train_model(model, data_loader, optimizer, device, num_epochs=1):
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=5)
        optimizer.step()
        lr_scheduler.step()

        # оценка
        evaluate(model, data_loader, device=device)


In [17]:
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import json

# Инициализация датасета, DataLoader и модели
dataset = CustomDataset(images_folder="/images", annotations_file="coordinates.json", transforms=get_transform(train=True))

# Определение размеров выборок
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Разделение датасета на обучающую и тестовую выборки
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, num_workers=2, collate_fn=collate_fn)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Инициализация оптимизатора
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Запуск обучения
train_model(model, train_loader, optimizer, device)


Epoch: 0 [0/46] Loss: 0.32082879543304443
Epoch: 0 [5/46] Loss: 0.20271755754947662
Epoch: 0 [10/46] Loss: 0.18786242604255676
Epoch: 0 [15/46] Loss: 0.2280522584915161
Epoch: 0 [20/46] Loss: 0.19985026121139526
Epoch: 0 [25/46] Loss: 0.20830775797367096
Epoch: 0 [35/46] Loss: 0.13413625955581665
Epoch: 0 [40/46] Loss: 0.21581299602985382
Epoch: 0 [45/46] Loss: 0.160865917801857


### Тестирование модели

In [18]:
loss = evaluate(model, test_loader, device=device)
print(f"Test Loss: {loss}")

Test Loss: 0.2019941359758377


## Сохранение модели

In [19]:
torch.save(model.state_dict(), 'fasterrcnn_resnet50_fpn.pth')

# Модель распознавания текста с наклеек: Tesseract OCR
 Является одним из наиболее широко используемых инструментов OCR, поддерживает множество языков и имеет хорошую точность распознавания текста.

In [83]:
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 24 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,570 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [84]:
import pytesseract

# Функция для извлечения текста из области изображения
def extract_text_from_image(image, box):
    # Обрезка изображения по указанной области
    cropped_image = image.crop((box[0], box[1], box[2], box[3]))
    # Распознавание текста с помощью Tesseract OCR
    text = pytesseract.image_to_string(cropped_image, lang='eng')
    return text

## Извлечение текста из изображений исходя из полученной ранее разметки

In [22]:
# Загрузка обученной модели
model2 = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=2)
model2.load_state_dict(torch.load('fasterrcnn_resnet50_fpn.pth'))


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 91.2MB/s]


<All keys matched successfully>

In [85]:
from torchvision.transforms import functional as F
from PIL import Image

def prepare_image(img_name):
    # Загрузка изображения
    img_path = os.path.join('/images', f"{int(img_name)}.png")
    image = Image.open(img_path).convert("RGB")

    # # Преобразование изображения в тензор
    # image = F.to_tensor(image)
    return image

In [57]:
@torch.no_grad()  # Отключение расчета градиентов
def get_predictions(model, data_loader):
    model.eval()  # Перевод модели в режим оценки
    predictions = []
    for images, targets in data_loader:
        images = [image.to(device) if isinstance(image, torch.Tensor) else F.to_tensor(image).to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        outputs = model(images, targets)

        outputs_boxes = [out['boxes'][0] for out in outputs]
        outputs_img_id = [t['image_id'] for t in targets]

        predictions.extend(zip(outputs_img_id, outputs_boxes))
    return predictions

# Получение предсказаний, используя test_loader
predictions = get_predictions(model2, test_loader)


In [70]:
predictions[0][1].tolist()

[143.58335876464844, 96.24446105957031, 383.00439453125, 232.49488830566406]

In [86]:
# Извлечение текста из каждой области
texts = [extract_text_from_image(prepare_image(img), target.tolist()) for img, target in predictions]

In [141]:
target_texts = ['\x0c']*len(predictions)
accuracy = 0
for text, target_text in zip(texts, target_texts):
  accuracy += 1 if text in ' \n\x0c' else 0

accuracy /= len(predictions)

In [142]:
print(accuracy)

0.9583333333333334
