# 1. 先載入預訓練好的YOLOv4，對大圖("test"資料夾)做出result.json，再做出小圖("little test"資料夾)
# 2. 接著載入預訓練好的TrOCR模型做text generation(要求檔名順序與生成的text對應)

In [1]:
! nvidia-smi

Mon Apr 18 13:57:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
! git clone https://github.com/AlexeyAB/darknet.git

Cloning into 'darknet'...
remote: Enumerating objects: 15412, done.[K
remote: Total 15412 (delta 0), reused 0 (delta 0), pack-reused 15412[K
Receiving objects: 100% (15412/15412), 14.04 MiB | 14.69 MiB/s, done.
Resolving deltas: 100% (10356/10356), done.


In [3]:
# modify Makefile
! sed -i 's/GPU=0/GPU=1/' /content/darknet/Makefile
! sed -i 's/CUDNN=0/CUDNN=1/' /content/darknet/Makefile
! sed -i 's/OPENCV=0/OPENCV=1/' /content/darknet/Makefile
! sed -i 's/CUDNN_HALF=0/CUDNN_HALF=1/' /content/darknet/Makefile

In [4]:
# compile darknet
! cd darknet; make

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# load test data
! unzip /content/gdrive/MyDrive/Tbrain/new_public_testing_data.zip > new_testing_data_unzip.log

In [7]:
# some helper function
import cv2
from tqdm import trange
from random import shuffle
from os import listdir
from os import rename as rename_mv
from pandas import set_option, read_csv

set_option('display.max_columns', None)


def create_predict_imgs_txt(test_data_path: str, output_path: str) -> None:
    """Write the list of images to predict on, one path per line.

    Every entry found in ``test_data_path`` is written to
    ``<output_path>predict_imgs.txt``.  Both arguments are used as plain
    string prefixes, so directory paths must end with a path separator
    (e.g. ``'/content/new_public_testing_data/'`` and ``'/content/'``).

    Entries are sorted by name: ``listdir`` returns them in arbitrary order,
    and sorting keeps the generated list reproducible between runs.
    """
    data_ls = [test_data_path + p for p in sorted(listdir(test_data_path))]
    with open(f'{output_path}predict_imgs.txt', 'w') as f:
        f.write('\n'.join(data_ls))


def generate_label_file_match_each_img(data_path: str, name_path: str, output_path: str):
    """Rename every annotated image and write a YOLO label file next to it.

    For row ``index`` of the CSV at ``name_path`` the image
    ``<data_path><filename>.jpg`` is moved to ``<output_path>img<index>.jpg``
    and ``<output_path>img<index>.txt`` is created with the single line
    ``0 x_center y_center box_width box_height``, where the four values are
    divided by the image width/height.

    Example arguments:
        name_path = '/content/data_description/public_training_data.csv'
        output_path = '/content/my_yolo/'
    """
    # check_each_img_shape() asserts that every image has shape
    # (height, width, channels) = (1028, 1232, 3) -- note: height comes first.
    height, width = 1028, 1232
    info_df = read_csv(name_path)
    with trange(len(info_df)) as t:
        for index in t:
            df = info_df.iloc[index, :]
            # Axis-aligned box enclosing the four annotated corners.
            x_min = min(df['top left x'], df['bottom left x'])
            x_max = max(df['top right x'], df['bottom right x'])
            y_min = min(df['top left y'], df['top right y'])
            # Both bottom corners must be considered; comparing
            # 'bottom right y' with itself ignored the bottom-left corner.
            y_max = max(df['bottom left y'], df['bottom right y'])
            # Box centre and size as fractions of the image size.
            x = (x_min + (x_max - x_min) / 2) / width
            y = (y_min + (y_max - y_min) / 2) / height
            w = (x_max - x_min) / width
            h = (y_max - y_min) / height
            # Rename the image and move it to its new location.
            prev_name = f"{data_path}{df['filename']}.jpg"
            new_name = f'{output_path}img{index}'
            rename_mv(prev_name, f'{new_name}.jpg')
            # Write the matching label file; class id is always 0.
            with open(file=f'{new_name}.txt', mode='w') as f:
                f.write(f'0 {x} {y} {w} {h}')
            t.set_description(f'Progress {index + 1}')


def generate_train_test_txt(output_path: str, *, num_images: int = 12067,
                            image_prefix: str = '/content/my_yolo/train_img',
                            train_ratio: float = 0.8):
    """Shuffle the image paths and split them into train.txt / test.txt.

    The paths ``<image_prefix><i>.jpg`` for ``i`` in ``range(num_images)`` are
    shuffled; the first ``int(num_images * train_ratio)`` go to
    ``<output_path>train.txt`` and the rest to ``<output_path>test.txt``,
    one path per line.  ``output_path`` is used as a plain string prefix
    (e.g. ``'/content/'``).

    The defaults reproduce the original hard-coded data set:
    train_img0.jpg .. train_img12066.jpg (12067 files) with an 80/20 split.

    Raises:
        ValueError: if ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f'train_ratio must be within [0, 1], got {train_ratio}')
    data_ls = [f'{image_prefix}{i}.jpg' for i in range(num_images)]
    shuffle(data_ls)
    split_index = int(len(data_ls) * train_ratio)
    train_ls = data_ls[:split_index]
    test_ls = data_ls[split_index:]
    with open(f'{output_path}train.txt', 'w') as f1, open(f'{output_path}test.txt', 'w') as f2:
        f1.write('\n'.join(train_ls))
        f2.write('\n'.join(test_ls))


def check_each_img_shape(data_path: str, name_path: str):
    """Verify that every image listed in the CSV has the expected shape.

    Reads the ``filename`` column of the CSV at ``name_path`` and loads
    ``<data_path><filename>.jpg`` for each row, printing the row index as
    progress.

    Example arguments:
        name_path = '/content/data_description/public_training_data.csv'
        data_path = '/content/public_training_data/'

    Raises:
        OSError: if an image cannot be read (``cv2.imread`` returned ``None``).
        AssertionError: if an image's shape differs from (1028, 1232, 3).
    """
    expected_shape = (1028, 1232, 3)  # (height, width, channels)
    info_df = read_csv(name_path)
    for index, name in enumerate(info_df['filename']):
        image_path = data_path + name + '.jpg'
        image_array = cv2.imread(image_path)
        print(f'progress: {index}')
        if image_array is None:
            # imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image (missing or unreadable): {image_path}')
        if image_array.shape != expected_shape:
            # Raised explicitly so the check still runs under `python -O`.
            raise AssertionError(
                f'{image_path}: shape {image_array.shape} != {expected_shape}')


def check_training_coordinate(data_path: str, name_path: str):
    """Show each training image next to the crop given by its annotation.

    For every row of the CSV at ``name_path`` the image
    ``<data_path><filename>.jpg`` is loaded, cropped to the axis-aligned box
    enclosing the four annotated corners, and both the full image and the
    crop are displayed for two seconds.

    Example arguments:
        data_path = '/content/public_training_data/'
        name_path = '/content/data_description/public_training_data.csv'

    Raises:
        OSError: if an image cannot be read (``cv2.imread`` returned ``None``).
    """
    # Lesson from slicing by coordinate: the image array is indexed
    # (y, x, channels) = (height, width, channels).
    # Previously I only knew that index (0, 0) is the top-left corner.
    # (0, 0)            (1232, 0)
    # -------------------------
    # |  x+                   |
    # | y                     |
    # | +                     |
    # |                       |
    # |                       |
    # |                       |
    # -------------------------
    # (0, 1028)         (1232, 1028)
    info_df = read_csv(name_path)
    for index in range(len(info_df)):
        df = info_df.iloc[index, :]
        x_min = min(df['top left x'], df['bottom left x'])
        x_max = max(df['top right x'], df['bottom right x'])
        y_min = min(df['top left y'], df['top right y'])
        # Both bottom corners must be considered; comparing
        # 'bottom right y' with itself ignored the bottom-left corner.
        y_max = max(df['bottom left y'], df['bottom right y'])
        image_path = data_path + df['filename'] + '.jpg'
        image_array = cv2.imread(image_path)
        if image_array is None:
            # imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image (missing or unreadable): {image_path}')
        # Rows are y, columns are x.
        slice_by_coordinate = image_array[int(y_min): int(y_max), int(x_min): int(x_max), :]
        cv2.imshow('img0', image_array)
        cv2.imshow('img1', slice_by_coordinate)
        cv2.waitKey(2000)  # pause for 2 seconds before fetching the next image
        print(f"{index}_{df['filename']}")

In [8]:
! rm -rf /content/my_yolo_cfg/
! mkdir /content/my_yolo_cfg/

In [9]:
# 從gdrive 取出 config file 放置於 /content/my_yolo_cfg/
! cp /content/gdrive/MyDrive/Tbrain/my_yolo_cfg/my_obj.* /content/my_yolo_cfg/

In [10]:
! head /content/my_yolo_cfg/my_obj.data

classes= 1
train  = /content/my_yolo_cfg/train.txt
valid  = /content/my_yolo_cfg/test.txt
names = /content/my_yolo_cfg/my_obj.names
backup = /content/my_yolo_cfg/my_weights/

In [11]:
! head /content/my_yolo_cfg/my_obj.names

target

In [12]:
! mkdir /content/my_yolo_cfg/my_weights/

In [13]:
# copy the pre-trained weights from gdrive to /content/my_yolo_cfg/my_weights/
! cp /content/gdrive/MyDrive/Tbrain/yolov4-custom_3000.weights /content/my_yolo_cfg/my_weights/

In [14]:
# 將原有的參數宣告(.cfg file)從darknet複製到 /content/my_yolo_cfg/。
! cp /content/darknet/cfg/yolov4-custom.cfg /content/my_yolo_cfg/

In [15]:
# create predict_imgs.txt, the image list that "darknet detector test" reads to predict box coordinates
create_predict_imgs_txt(test_data_path='/content/new_public_testing_data/', output_path='/content/')

In [16]:
# train時： 根據GitHub說明，修改參數宣告。
# ! sed -i '6s/64/16/' /content/my_yolo_cfg/yolov4-custom.cfg  # batch
! sed -i '7s/16/64/' /content/my_yolo_cfg/yolov4-custom.cfg  # mini_batch = batch/subdivisions, if out of memory, try to increase this value.
! sed -i '8s/608/832/' /content/my_yolo_cfg/yolov4-custom.cfg  #  network size width, increase network resolution, it will increase precision.
! sed -i '9s/608/832/' /content/my_yolo_cfg/yolov4-custom.cfg  #  network size height, increase network resolution, it will increase precision.
! sed -i '20s/500500/8000/' /content/my_yolo_cfg/yolov4-custom.cfg  # max_batches is the total number of training iterations (batches), not epochs.
! sed -i '22s/400000,450000/6400,7200/' /content/my_yolo_cfg/yolov4-custom.cfg  # steps
! sed -i '970s/80/1/' /content/my_yolo_cfg/yolov4-custom.cfg  # classes
! sed -i '1058s/80/1/' /content/my_yolo_cfg/yolov4-custom.cfg  # classes
! sed -i '1146s/80/1/' /content/my_yolo_cfg/yolov4-custom.cfg  # classes
! sed -i '963s/255/18/' /content/my_yolo_cfg/yolov4-custom.cfg  # filters
! sed -i '1051s/255/18/' /content/my_yolo_cfg/yolov4-custom.cfg  # filters
! sed -i '1139s/255/18/' /content/my_yolo_cfg/yolov4-custom.cfg  # filters
# To make the detected bounding boxes more accurate (higher IoU), we change 3 parameters
# in each [yolo] layer before training: ignore_thresh=.9, iou_normalizer=0.5, iou_loss=giou.
! sed -i '973s/.7/.9/' /content/my_yolo_cfg/yolov4-custom.cfg  # ignore_thresh
! sed -i '978s/0.07/0.5/' /content/my_yolo_cfg/yolov4-custom.cfg  # iou_normalizer
! sed -i '979s/ciou/giou/' /content/my_yolo_cfg/yolov4-custom.cfg  # iou_loss
! sed -i '1061s/.7/.9/' /content/my_yolo_cfg/yolov4-custom.cfg  # ignore_thresh
! sed -i '1066s/0.07/0.5/' /content/my_yolo_cfg/yolov4-custom.cfg  # iou_normalizer
! sed -i '1067s/ciou/giou/' /content/my_yolo_cfg/yolov4-custom.cfg  # iou_loss
! sed -i '1149s/.7/.9/' /content/my_yolo_cfg/yolov4-custom.cfg  # ignore_thresh
! sed -i '1155s/0.07/0.5/' /content/my_yolo_cfg/yolov4-custom.cfg  # iou_normalizer
! sed -i '1156s/ciou/giou/' /content/my_yolo_cfg/yolov4-custom.cfg  # iou_loss

# test時： custom.cfg的內容batch、subdivision都改為1，其餘皆不變(與train時相同)。
! sed -i '6s/64/1/' /content/my_yolo_cfg/yolov4-custom.cfg  # batch
! sed -i '7s/64/1/' /content/my_yolo_cfg/yolov4-custom.cfg  # mini_batch = batch/subdivisions, if out of memory, try to increase this value.

In [17]:
# 查看yolov4-custom.cfg第6、7、8、9、20、22行的設定。
! sed -n -e 6p -e 7p -e 8p -e 9p -e 20p -e 22p /content/my_yolo_cfg/yolov4-custom.cfg

batch=1
subdivisions=1
width=832
height=832
max_batches = 8000
steps=6400,7200


# 於此生成 new_test_result.json後，藉此去生成小圖(little)

In [18]:
# predict multiple images and output .json
! darknet/darknet detector test /content/my_yolo_cfg/my_obj.data /content/my_yolo_cfg/yolov4-custom.cfg /content/my_yolo_cfg/my_weights/yolov4-custom_3000.weights -dont_show -ext_output -out new_test_result.json < /content/predict_imgs.txt

# 生成小圖後做文字辨識

In [19]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [20]:
! pip3 install transformers==4.12.0
! cp /content/gdrive/MyDrive/Tbrain/data_description/submission_template.csv ./
! unzip /content/gdrive/MyDrive/Tbrain/little_new_public_testing_data_with_artificial.zip > little_new_testing_data_unzip.log

Collecting transformers==4.12.0
  Downloading transformers-4.12.0-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 8.6 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [21]:
# download model from gdrive to colab
! cp -r /content/gdrive/MyDrive/Tbrain/seq2seq/my-model-epoch54_processor ./
! unzip /content/gdrive/MyDrive/Tbrain/seq2seq/my-model-epoch54.zip > my-model-epoch54.log

In [22]:
import os
import torch
from tqdm.auto import tqdm
from pandas import read_csv, DataFrame
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderConfig, TrOCRProcessor, VisionEncoderDecoderModel


class TestDataset(Dataset):
    """Dataset of test images whose names come from the ``id`` column of a CSV.

    Item ``i`` is the image ``<images_dir>/<id_i>.jpg`` opened with PIL and
    converted to RGB.
    """

    def __init__(self, images_dir, csv_file):
        frame = read_csv(csv_file)
        self.images_dir = images_dir
        self.df = frame
        # Image ids (file names without extension), in CSV row order.
        self.images = frame['id'].tolist()

    def __len__(self):
        """Return the number of ids read from the CSV."""
        return len(self.images)

    def __getitem__(self, index):
        """Load the image for row ``index`` as an RGB PIL image."""
        image_path = os.path.join(self.images_dir, self.images[index] + '.jpg')
        rgb_image = Image.open(image_path).convert('RGB')
        return rgb_image


class TestCollate:
    """Collate callable that turns a batch of images into model pixel values.

    ``tokenizer`` is called with the batch as a list and
    ``return_tensors="pt"``; the ``pixel_values`` attribute of its result is
    returned.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        encoded = self.tokenizer(list(batch), return_tensors="pt")
        return encoded.pixel_values

In [23]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the saved processor, config and model, then move the model to the device.
my_processor = TrOCRProcessor.from_pretrained('/content/my-model-epoch54_processor')
encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('/content/my-model-epoch54')
my_model = VisionEncoderDecoderModel.from_pretrained(
    '/content/my-model-epoch54',
    config=encoder_decoder_config,
)
my_model = my_model.to(device)

# shuffle=False keeps the batches in the row order of submission_template.csv,
# so the generated texts line up with the ids written out below.
test_ds = TestDataset(
    images_dir='/content/little_new_public_testing_data/',
    csv_file='/content/submission_template.csv',
)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=100,
    shuffle=False,
    collate_fn=TestCollate(tokenizer=my_processor),
)

# Generate text batch by batch and collect it in order.
res_ls = []
with tqdm(test_loader, unit='batch') as tepoch:
    for pixel_values in tepoch:
        generated_ids = my_model.generate(pixel_values.to(device))
        generated_text = my_processor.batch_decode(generated_ids, skip_special_tokens=True)
        # Remove every space from the decoded strings.
        generated_text = [i_str.replace(' ', '') for i_str in generated_text]
        res_ls.extend(generated_text)
        tepoch.set_postfix(mode='test')


# Pair each id from the template with its generated text and save the result.
id_df = read_csv('/content/submission_template.csv')
res_d = {'id': id_df['id'].tolist(), 'text': res_ls}
res_df = DataFrame(res_d)
res_df.to_csv('tbrain_result.csv', index=False)

  100%|██████████████| 60/60 [25:32<00:00, 25.58s/batch, mode=test]