# Install packages

In [None]:
# Install the Tesseract OCR binary, then the pinned Python packages used below.
!sudo apt install tesseract-ocr
!pip install pytesseract==0.3.8
!pip install transformers==4.11.3

# Imports

In [None]:
from __future__ import print_function, division
import torch
import torchvision
from tqdm import tqdm, tqdm_notebook
from os.path import exists
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets, models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import time
import os
import copy
import torch.nn.functional as F
from torch.autograd import Variable
import torch.onnx
import cv2
import seaborn as sns
import pytesseract
import pandas as pd
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
%matplotlib inline
from google.colab import drive
drive.mount('/content/gdrive')
plt.ion() 

# CUDA status

In [None]:
# Use the GPU when CUDA is usable, otherwise the CPU; the bare tuple below is
# shown as the cell output.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device, torch.cuda.is_available()

In [None]:
# A newer torch release is not compatible with the GPUs available on Colab,
# so a CUDA 10.1 build is installed when CUDA is not usable.
# See https://pytorch.org/ (cu101 wheels).
# Build a tag of the form 'cuXY' from the version suffix of the cudart library
# that ldconfig reports.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
print("\ndevice: ", device, "\nPyTorch Version: ", torch.__version__, "\nTorchvision Version: ", torchvision.__version__, \
    "\nПроверяем, доступны ли GPU: ", torch.cuda.is_available(), "\naccelerator: ", accelerator)
# NOTE(review): torch is already imported in this kernel, so a reinstall here
# presumably only takes effect after the runtime is restarted - confirm.
if torch.cuda.is_available() == False:
    !pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

# Check again whether a GPU is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Same report as above, printed after the optional reinstall.
print("\ndevice: ", device, "\nPyTorch Version: ", torch.__version__, "\nTorchvision Version: ", torchvision.__version__, \
    "\nПроверяем, доступны ли GPU: ", torch.cuda.is_available(), "\naccelerator: ", accelerator)

In [None]:
# Print the torch build configuration and the CUDA version it was built for;
# the last expression shows whether CUDA is usable right now.
print(torch.__config__.show()) 
print(torch.version.cuda)
torch.cuda.is_available()

# 0) Work with data

In [None]:
# Dataset root on the mounted Google Drive; the loaders below read the
# 'train' and 'test' sub-folders of this directory.
data_dir="/content/gdrive/MyDrive/data/dataset"

In [None]:
# Per-split preprocessing. Both splits end up as 540x540 tensors normalised
# with mean 0.5 / std 0.5 per channel.
data_transforms = {
    # Training: random crop + rescale acts as data augmentation.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(540),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
    # Evaluation must be deterministic, so use a fixed resize and centre crop
    # instead of a random crop; otherwise the reported metrics change from
    # run to run for the same weights.
    'test': transforms.Compose([
        transforms.Resize(540),
        transforms.CenterCrop(540),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
}

In [None]:
def get_dataset(data_dir, data_transforms, folders=('train', 'test'), batch_size=4, num_workers=4):
    """Build ImageFolder datasets and loaders for the requested splits.

    Args:
        data_dir: Root directory containing one sub-folder per split.
        data_transforms: Mapping from split name to the transform applied to it.
        folders: Split names to load. Must contain 'train' and 'test', because
            both loaders are returned and the class list is read from 'train'.
        batch_size: Number of images per batch (default 4, as before).
        num_workers: Number of loader worker processes (default 4, as before).

    Returns:
        Tuple ``(train_loader, test_loader, classes, dataset_sizes)`` where
        ``classes`` is the class list of the 'train' dataset and
        ``dataset_sizes`` maps each split name to its number of images.
    """
    image_datasets = {
        split: datasets.ImageFolder(os.path.join(data_dir, split), data_transforms[split])
        for split in folders
    }
    dataloaders = {
        split: torch.utils.data.DataLoader(
            image_datasets[split],
            batch_size=batch_size,
            # Only the training split needs a random order; keeping the
            # other splits ordered makes evaluation output reproducible.
            shuffle=(split == 'train'),
            num_workers=num_workers,
        )
        for split in folders
    }
    dataset_sizes = {split: len(image_datasets[split]) for split in folders}
    classes = image_datasets['train'].classes

    return dataloaders["train"], dataloaders['test'], classes, dataset_sizes

In [None]:
# Build the image loaders for both splits and report the class names and the
# number of images per split.
trainloader, testloader, classes, dataset_sizes=get_dataset(data_dir,data_transforms, folders=['train', 'test'])
print('Classes: ',  classes)
print('The datasest have: ',  dataset_sizes ," images")

In [None]:
# TODO: check that the train/test splits are valid

In [None]:
def imshow(img):
    """Display a normalised CHW image tensor with matplotlib.

    The tensor is mapped back with ``img / 2 + 0.5`` (the inverse of a
    mean-0.5 / std-0.5 normalisation) and transposed to HWC for plotting.

    Args:
        img: CPU tensor laid out as (channels, height, width).
    """
    img = img / 2 + 0.5
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()
# Preview one training batch as an image grid with its class names underneath.
dataiter = iter(trainloader)
images, labels = next(dataiter)
imshow(torchvision.utils.make_grid(images))
# Iterate over the labels actually present: a hard-coded count raises
# IndexError whenever the batch holds fewer samples than that.
print('|'.join('%10s' % classes[labels[j]] for j in range(len(labels))))

# 1) Image classification

In [None]:
def fit_epoch(model, train_loader, criterion, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to optimise; batches are moved to the device of its
            first parameter (CPU when it has no parameters).
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimiser that updates ``model``.

    Returns:
        Tuple ``(train_loss, train_acc)``: sample-weighted mean loss and the
        fraction of correctly classified samples, both as floats.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    # An evaluation pass leaves the network in eval mode; without switching
    # back, every epoch after the first would train with BatchNorm/Dropout
    # frozen in inference behaviour.
    model.train()

    # Follow the model's own placement instead of relying on a global.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    running_loss = 0.0
    running_corrects = 0
    processed_data = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(target)
        labels = labels.to(target)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, 1)
        # The criterion returns a batch mean, so weight it by the batch size.
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels).item()
        processed_data += inputs.size(0)

    if processed_data == 0:
        raise ValueError("train_loader yielded no samples")

    train_loss = running_loss / processed_data
    train_acc = running_corrects / processed_data
    return train_loss, train_acc

In [None]:
def eval_epoch(model, val_loader, criterion, verbose=True):
    """Evaluate ``model`` on ``val_loader`` without updating it.

    Args:
        model: Network to evaluate; it is switched to eval mode and batches
            are moved to the device of its first parameter (CPU when it has
            no parameters).
        val_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss taking ``(outputs, labels)``.
        verbose: When true, print ``prediction label`` for every
            misclassified sample.

    Returns:
        Tuple ``(val_loss, val_acc)``: sample-weighted mean loss as a float
        and accuracy as a 0-dim double tensor on the CPU.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()

    # Follow the model's own placement instead of relying on a global.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    running_loss = 0.0
    running_corrects = 0
    processed_size = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(target)
            labels = labels.to(target)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1)

            # The criterion returns a batch mean, so weight it by batch size.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels).item()
            processed_size += inputs.size(0)

            if verbose:
                # Report every wrong prediction in the batch, not just the
                # sample at position 0.
                for idx in torch.nonzero(preds != labels).flatten().tolist():
                    print(preds[idx], labels[idx])

    if processed_size == 0:
        raise ValueError("val_loader yielded no samples")

    val_loss = running_loss / processed_size
    val_acc = torch.tensor(running_corrects, dtype=torch.double) / processed_size
    return val_loss, val_acc

In [None]:
def train(
    train_loader,
    val_loader,
    model,
    criterion,
    epochs,
    batch_size,
    optimizer,
    scheduler,
    sampler = None,
    shuffle = True,
):
    """Alternate training and validation passes for ``epochs`` epochs.

    Each epoch calls ``fit_epoch`` and ``eval_epoch``, steps ``scheduler``
    once, advances the progress bar and writes a one-line summary.
    ``batch_size``, ``sampler`` and ``shuffle`` are accepted but not used by
    this function.

    Returns:
        List with one ``(train_loss, train_acc, val_loss, val_acc)`` tuple
        per epoch.
    """
    log_template = (
        "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} "
        "    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"
    )
    history = []

    with tqdm(desc="epoch", total=epochs) as progress:
        for epoch_idx in range(epochs):
            train_loss, train_acc = fit_epoch(model, train_loader, criterion, optimizer)
            print("loss", train_loss)

            val_loss, val_acc = eval_epoch(model, val_loader, criterion)
            epoch_stats = (train_loss, train_acc, val_loss, val_acc)
            history.append(epoch_stats)

            scheduler.step()
            progress.update(1)

            summary = log_template.format(
                ep=epoch_idx + 1,
                t_loss=train_loss,
                v_loss=val_loss,
                t_acc=train_acc,
                v_acc=val_acc,
            )
            tqdm.write(summary)

    return history

In [None]:
def predict(model, test_loader):
    """Return class probabilities for every sample in ``test_loader``.

    Args:
        model: Network to run; it is switched to eval mode and batches are
            moved to the device of its first parameter (CPU when it has no
            parameters).
        test_loader: Iterable yielding either input tensors or
            ``(inputs, ...)`` tuples/lists; only the first element is used.

    Returns:
        NumPy array of softmax probabilities over the last dimension, one row
        per sample in loader order.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    # Switch mode once up front instead of on every batch.
    model.eval()

    # Follow the model's own placement instead of relying on a global.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    logits = []
    with torch.no_grad():
        for batch in test_loader:
            # Labelled loaders yield (inputs, labels); keep only the inputs.
            inputs = batch[0] if isinstance(batch, (list, tuple)) else batch
            logits.append(model(inputs.to(target)).cpu())

    if not logits:
        raise ValueError("test_loader yielded no batches")

    probs = nn.functional.softmax(torch.cat(logits), dim=-1).numpy()
    return probs

In [None]:
# Load torchvision's GoogLeNet with pretrained weights as the starting point
# for fine-tuning.
myModel = models.googlenet(pretrained=True)

In [None]:
%%timeit
myModel(images[:1])

In [None]:
# Number of output classes for the new classification head.
n_classes = 9
# Freeze the pretrained weights; the head created below is new and therefore
# trainable.
for param in myModel.parameters():
    param.requires_grad = False
# Fall back to the CPU when no GPU is present instead of failing on .to().
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
numFeat = myModel.fc.in_features
myModel.fc = nn.Linear(numFeat, n_classes)
myModel = myModel.to(DEVICE)
criterizator = nn.CrossEntropyLoss()
optimizator = torch.optim.AdamW(myModel.parameters())
# Halve the learning rate every 3 scheduler steps.
shedulator = torch.optim.lr_scheduler.StepLR(optimizator, step_size=3, gamma=0.5)

In [None]:
%%time

history = train(trainloader, testloader, model=myModel, criterion = criterizator, epochs=10, batch_size=40,optimizer = optimizator,scheduler = shedulator)

for param in myModel.parameters():
  param.requires_grad = True
history = train(trainloader, testloader, model=myModel, criterion = criterizator, epochs=24, batch_size=40,optimizer = optimizator,scheduler = shedulator)

# 2) Image to string

In [None]:
# Collect image files only. Other files under data_dir (for example the CSVs
# this notebook saves there) cannot be decoded as images and would break the
# OCR step and the label parsing on a re-run.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')

img_files = []
for path, subdirs, files in os.walk(data_dir):
    for name in files:
        if name.lower().endswith(IMAGE_EXTENSIONS):
            img_files.append(os.path.join(path, name))

In [None]:
def image_to_string(img_filepath):
  """Run Tesseract OCR on an image file and return the recognised text.

  Args:
    img_filepath: Path of the image to read.

  Returns:
    The string produced by ``pytesseract.image_to_string``.

  Raises:
    ValueError: If OpenCV cannot read the file as an image.
  """
  img_cv = cv2.imread(img_filepath)
  # cv2.imread signals failure by returning None rather than raising; fail
  # here with the offending path instead of deep inside cvtColor.
  if img_cv is None:
    raise ValueError(f"Could not read image: {img_filepath}")
  # OpenCV loads BGR; convert to RGB before handing the array to Tesseract.
  img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
  return pytesseract.image_to_string(img_rgb)

In [None]:
%%time

df_dict = {
    'filepath': [],
    'text': [],
}

for img in tqdm(img_files):
  text = image_to_string(img)
  df_dict['filepath'].append(img)
  df_dict['text'].append(text)

df_raw = pd.DataFrame(df_dict)

In [None]:
# Split the part of each path that follows data_dir on '/': element 1 is the
# split folder and element 2 is the class folder.
rel_parts = df_raw['filepath'].str.split(data_dir).str[1].str.split('/')
df_raw['type'] = rel_parts.str[1]
# The class folder's second space-separated token is parsed as a 1-based
# number and shifted to a 0-based label.
df_raw['label'] = rel_parts.str[2].str.split(' ').str[1].astype(int) - 1

In [None]:
# Rows whose split folder is 'train' form the training frame; everything else
# is used for testing.
mask = df_raw['type'] == 'train'
df_train = df_raw[mask]
df_test = df_raw[~mask]
# Do not write the positional index: it carries no information and would
# come back as a spurious 'Unnamed: 0' column when the CSV is re-read.
df_train.to_csv(os.path.join(data_dir, 'df_train.csv'), index=False)
df_test.to_csv(os.path.join(data_dir, 'df_test.csv'), index=False)

In [None]:
# Reload the cached OCR results so the expensive OCR step can be skipped.
df_train = pd.read_csv(os.path.join(data_dir, 'df_train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'df_test.csv'))
# read_csv turns empty fields into NaN; restore empty strings so images with
# no recognised text are not tokenised as the literal word "nan".
df_train['text'] = df_train['text'].fillna('')
df_test['text'] = df_test['text'].fillna('')

# 3) BERT model

In [None]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [None]:
# Checkpoint name shared by the tokenizer here and the BERT encoder below.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Token count per training text, capped at 512. Truncation is requested
# explicitly so the cap does not depend on the tokenizer's legacy fallback
# (which also emits a warning for every call). str() keeps non-string cells
# such as NaN from crashing the tokenizer.
token_lens = [
  len(tokenizer.encode(str(txt), max_length=512, truncation=True))
  for txt in df_train.text
]

In [None]:
# Plot the distribution of token counts per training text.
# NOTE(review): seaborn deprecated distplot in favour of histplot/displot;
# confirm against the installed seaborn version before upgrading.
sns.distplot(token_lens)
# Optional zoom, currently disabled:
# plt.xlim([0, 256]);
plt.xlabel('Token count');

In [None]:
# Pad/truncate length for the text dataset: the longest token count measured
# on the training texts.
MAX_LEN = max(token_lens)

In [None]:
class TextDataset(Dataset):
  """Map-style dataset that tokenises the 'text' column of a DataFrame.

  Args:
    df: DataFrame with a 'text' column and an integer 'label' column.
    tokenizer: Tokenizer exposing ``encode_plus``.
    max_len: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, df, tokenizer, max_len):
    self.df = df
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.df)

  def __getitem__(self, item):
    """Return the encoded sample at position ``item``.

    Returns:
      Dict with the raw 'text', 1-D 'input_ids' and 'attention_mask'
      tensors, and the label as a long tensor under 'labels'.
    """
    # ``item`` is a position in 0..len-1, so index positionally; label-based
    # lookup breaks on any frame whose index is not a clean 0..n-1 range
    # (for example a filtered frame).
    text = str(self.df['text'].iloc[item])
    label = self.df['label'].iloc[item]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # Explicit padding/truncation replaces the deprecated
      # pad_to_max_length flag and the implicit truncation fallback.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'text': text,
      # The tokenizer returns (1, max_len) tensors; flatten to (max_len,).
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(label, dtype=torch.long)
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = TextDataset(
    df=df,
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

In [None]:
BATCH_SIZE = 4
trainloader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
testloader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

  cpuset_checked))


In [None]:
# Pull one batch from the text loader and list its fields as a sanity check.
data = next(iter(trainloader))
data.keys()

In [None]:
# Show the tensor shapes of the sampled batch.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['labels'].shape)

In [None]:
class TextClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification layer."""

  def __init__(self, n_classes):
    """Load the pretrained encoder and create an ``n_classes``-way head."""
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    hidden_dim = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_dim, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the head's output for the encoder's 'pooler_output'."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    pooled = encoded['pooler_output']
    return self.out(self.drop(pooled))

In [None]:
# Build the 9-class text classifier and place it on the selected device.
n_classes = 9
model = TextClassifier(n_classes).to(device)

In [None]:
# Move the sampled batch to the model's device and confirm the shapes.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so the schedule spans
# batches-per-epoch * EPOCHS steps. The text training loader in this notebook
# is named `trainloader`; `train_data_loader` is never defined.
total_steps = len(trainloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train ``model`` for one pass over ``data_loader``.

  Every batch is a mapping with 'input_ids', 'attention_mask' and 'labels'.
  After each backward pass the gradient norm is clipped to 1.0, then the
  optimiser and the scheduler are stepped and the gradients are cleared.

  Returns:
    Tuple ``(accuracy, mean_loss)``: correct predictions divided by
    ``n_examples`` (double tensor) and the mean of the per-batch losses.
  """
  model.train()
  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    targets = batch["labels"].to(device)

    logits = model(input_ids=ids, attention_mask=mask)
    predicted = torch.max(logits, dim=1).indices
    loss = loss_fn(logits, targets)

    n_correct += (predicted == targets).sum()
    batch_losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` with gradients disabled.

  Every batch is a mapping with 'input_ids', 'attention_mask' and 'labels'.

  Returns:
    Tuple ``(accuracy, mean_loss)``: correct predictions divided by
    ``n_examples`` (double tensor) and the mean of the per-batch losses.
  """
  model.eval()
  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      logits = model(input_ids=ids, attention_mask=mask)
      predicted = torch.max(logits, dim=1).indices

      batch_losses.append(loss_fn(logits, targets).item())
      n_correct += (predicted == targets).sum()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
from collections import defaultdict

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc