### Подготовка данных.

Libraries

In [41]:
# for data
import os
import torch
import glob

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from PIL import Image
from sklearn.model_selection import train_test_split

# for training
import torch.nn as nn

Static configs

In [28]:
# --- data locations -------------------------------------------------------
# glob pattern matching the raw sample files
PATH_2_SAMPELS = 'samples/*'
# directories that receive the train / test split
TRAIN_DIR = 'data/train'
TEST_DIR = 'data/test'

# --- data loading ---------------------------------------------------------
BATCH_SIZE = 8

# --- label vocabulary -----------------------------------------------------
ALPHABETH = 'abcdefghijklmnopqrstuvwxyz'
DIGITS = '0123456789'
CHARS = ''.join((ALPHABETH, DIGITS))
# one index more than there are characters
# (presumably reserved for a blank/padding symbol -- TODO confirm)
VOCAB_SIZE = 1 + len(CHARS)

# --- optimiser hyper-parameters -------------------------------------------
lr = 1e-2
weight_decay = 0.001
momentum = 0.9

# --- training length ------------------------------------------------------
EPOCHS = 10

In [39]:
class CaptchDataset(Dataset):
    """
    Dataset of images stored in a single directory.

    Each item is an ``(image_tensor, label)`` pair. The label is the
    lower-cased file name up to its first dot.
    """

    def __init__(self, pth):
        """
        Collect the paths of all entries found in ``pth``.

        The directory is listed once, here; files added to it afterwards are
        not picked up by an already constructed dataset.

        Args:
            pth: directory that holds the image files.
        """
        self.imgs = self._list_images(pth)
        self.transform = transforms.Compose([
            transforms.ToTensor()
        ])

    @staticmethod
    def _list_images(pth):
        """Return the absolute paths of the entries of ``pth``, sorted by name."""
        abs_pth = os.path.abspath(pth)
        # os.listdir order is arbitrary; sorting keeps indices reproducible
        return [os.path.join(abs_pth, name) for name in sorted(os.listdir(abs_pth))]

    @staticmethod
    def _label_from_path(pth):
        """Return the lower-cased file name of ``pth`` up to its first dot."""
        return os.path.basename(pth).split('.')[0].lower()

    def __len__(self):
        """
        Return the number of collected image paths.
        """
        return len(self.imgs)

    def __getitem__(self, idx):
        """
        Load the image at position ``idx``.

        Args:
            idx: index into the collected image paths.

        Returns:
            Tuple ``(img_tensor, label)``: the RGB image converted by
            ``self.transform`` and the label string derived from its file name.
        """
        pth = self.imgs[idx]
        label = self._label_from_path(pth)
        img = Image.open(pth).convert('RGB')
        img_tensor = self.transform(img)
        return img_tensor, label
      

In [40]:
# prepare folders: start every run from empty train / test directories
import shutil

for directory in (TRAIN_DIR, TEST_DIR):
    shutil.rmtree(directory, ignore_errors=True)
    # makedirs also creates the parent 'data' folder when it is missing
    os.makedirs(directory)

# split the sample files: 90% train / 10% test
train_set, test_set = train_test_split(glob.glob(PATH_2_SAMPELS), test_size=0.1)
for src in train_set:
    shutil.copy(src, os.path.join(TRAIN_DIR, os.path.basename(src)))
for src in test_set:
    shutil.copy(src, os.path.join(TEST_DIR, os.path.basename(src)))

# build the datasets only after the files are in place:
# the dataset lists its directory once, at construction time
train_dataset = CaptchDataset(TRAIN_DIR)
val_dataset = CaptchDataset(TEST_DIR)

# loading data
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE,
    shuffle=False
)


mkdir: data: File exists


### Создание и обучение модели.

CRNN class with layers

In [48]:
class CRNN(nn.Module):
    """
    Convolutional-recurrent network: CNN feature extractor, a linear
    projection, a 2-layer bidirectional GRU and a per-step linear classifier.

    The projection layer has ``in_features=1024``, i.e. 512 channels times a
    CNN output height of 2. NOTE(review): with the pooling used here that
    holds for input heights of roughly 4-19 px (e.g. 16); confirm the images
    are resized accordingly or adjust ``in_features``.
    """

    def __init__(self, vocab_size, dropout=0.5):
        """
        Build the layers.

        Args:
            vocab_size: number of output classes per sequence step.
            dropout: probability of an element to be zeroed by ``nn.Dropout``.
        """
        super().__init__()
        # one Dropout module, reused after the CNN and after the projection
        self.dropout = nn.Dropout(p=dropout)
        self.clayer = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=32,
                kernel_size=(3, 3),
                stride=1,
                padding=1
            ),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(2, 2),
                stride=2
            ),

            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=(3, 3),
                stride=1,
                padding=1
            ),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(2, 2),
                stride=2
            ),

            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=(3, 3),
                stride=1,
                padding=1
            ),
            nn.ReLU(),

            nn.Conv2d(
                in_channels=128,
                out_channels=256,
                kernel_size=(3, 3),
                stride=1,
                padding=1
            ),
            nn.ReLU(),
            # window of height 1, width 2; stride 2 is applied to both axes
            nn.MaxPool2d(
                kernel_size=(1, 2),
                stride=2
            ),

            nn.Conv2d(
                in_channels=256,
                out_channels=512,
                kernel_size=(3, 3),
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(512),
            nn.ReLU(),

            nn.Conv2d(
                in_channels=512,
                out_channels=512,
                kernel_size=(3, 3),
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(1, 2),
                stride=2
            ),

            # 2x2 kernel with padding 1 grows each spatial dimension by one
            nn.Conv2d(
                in_channels=512,
                out_channels=512,
                kernel_size=(2, 2),
                stride=1,
                padding=1
            ),
            self.dropout
        )
        # projects the flattened (channels * height) column to 256 features
        self.seq = nn.Sequential(
            nn.Linear(
                in_features=1024,
                out_features=256
            ),
            self.dropout
        )
        # forward() feeds (batch, width, features), so the GRU must be
        # batch-first; otherwise it would recur over the batch axis
        self.rnn_layer = nn.GRU(
            input_size=256,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )
        # 512 = 2 directions * hidden_size 256
        self.outlayer = nn.Linear(
            in_features=512,
            out_features=vocab_size
        )

    def forward(self, x):
        """
        Run the CNN, then the GRU over the width axis of the feature map.

        Args:
            x: image batch of shape (batch, 3, height, width).

        Returns:
            Tensor of shape (width_steps, batch, vocab_size) with the raw
            (un-normalised) class scores for every step.
        """
        x = self.clayer(x)  # (batch, 512, h, w)
        x = x.permute(0, 3, 1, 2)  # (batch, w, 512, h)
        x = x.view(x.size(0), x.size(1), -1)  # (batch, w, 512 * h)
        x = self.seq(x)  # (batch, w, 256)
        x, _ = self.rnn_layer(x)  # (batch, w, 512)
        x = self.outlayer(x)  # (batch, w, vocab_size)
        return x.permute(1, 0, 2)  # (w, batch, vocab_size)
        