# OpenALPR benchmark dataset preprocessing

In [1]:
# Create the output folders for the two splits. exist_ok=True makes this cell
# safe to re-run when the folders already exist, and no shell is required.
import os

os.makedirs('train', exist_ok=True)
os.makedirs('val', exist_ok=True)

In [2]:
import shutil
from glob import glob
from numpy.random import shuffle
import os
import csv

## Reset: remove all PNG files from the train and val folders

In [3]:
# Start from a clean slate: find every PNG already present in train/ and val/ ...
train_list, test_list = glob("train/*.png"), glob("val/*.png")
file_list = [*train_list, *test_list]

# ... and delete each of them.
for leftover_png in file_list:
    os.remove(leftover_png)

## Remove images with incorrect labels

In [4]:
def label_reader(file_path):
    """Load a two-column ground-truth CSV into a dict.

    Each non-blank row contributes one entry that maps its first column to
    its second column; a later row with the same first column overwrites the
    earlier one. In this notebook the result is looked up by image file name.

    Args:
        file_path: Path to a comma-separated file with at least two columns.

    Returns:
        dict mapping ``row[0]`` to ``row[1]`` for every non-blank row.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' leaves newline handling to the csv module, as the csv
    # documentation asks for files passed to csv.reader.
    with open(file_path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # skip those instead of failing on row[0].
        return {row[0]: row[1] for row in csv_reader if row}
                
# Load the ground-truth table; the cell output is the number of entries read.
true_labels = label_reader('groundtruth.csv')
len(true_labels)

751

In [5]:
# List every PNG in usimages/ and preview the first five paths.
img_list = glob('usimages/*.png')
img_list[:5]

['usimages/wa1071.png',
 'usimages/ny1237.png',
 'usimages/nj891.png',
 'usimages/nv1583.png',
 'usimages/ky729.png']

In [6]:
# Delete every source image whose ground-truth label is '0', which this
# notebook treats as an incorrect label.
# NOTE: this permanently removes the files from usimages/.
for img in img_list:
    # basename copes with the platform's path separator, unlike split('/').
    img_id = os.path.basename(img)
    # A missing ground-truth entry raises KeyError here rather than being
    # silently kept or dropped.
    if true_labels[img_id] == '0':
        os.remove(img)

## Do train-test split

In [7]:
# List usimages/ again (files may have been deleted since the first listing)
# and shuffle the list in place so the split below is random.
# NOTE(review): no seed is set in the visible cells, so each run presumably
# produces a different split — confirm whether reproducibility is needed.
img_list = glob('usimages/*.png')
shuffle(img_list)
len(img_list)

746

In [8]:
# Sizes of the two splits, given as absolute image counts (not a ratio).
# NOTE(review): hard-coded so that 600 + 146 equals the 746 images listed
# above; both values need updating if the number of images changes.
test_len = 146
train_len = 600
train_len, test_len

(600, 146)

In [9]:
# Copy the first train_len shuffled images into train/, keeping file names.
# Slicing simply stops early if img_list holds fewer than train_len entries.
for img_path in img_list[:train_len]:
    # basename copes with the platform's path separator, unlike split('/').
    img_id = os.path.basename(img_path)
    shutil.copyfile(img_path, 'train/' + img_id)

In [10]:
# Copy the next test_len shuffled images (those right after the training
# slice) into val/, keeping file names. Slicing simply stops early if
# img_list holds fewer than train_len + test_len entries.
for img_path in img_list[train_len:train_len + test_len]:
    # basename copes with the platform's path separator, unlike split('/').
    img_id = os.path.basename(img_path)
    shutil.copyfile(img_path, 'val/' + img_id)

## Create the label list files (train_list.txt and val_list.txt)

In [11]:
# Remove list files left by a previous run. A missing file is ignored, which
# matches `rm -f` for regular files without needing a Unix shell.
for list_path in ('train_list.txt', 'val_list.txt'):
    try:
        os.remove(list_path)
    except FileNotFoundError:
        pass

In [12]:
# List the PNGs now present in each split folder; the cell output shows how
# many there are in train/ and val/ respectively.
train_img = glob('train/*.png')
test_img = glob('val/*.png')
len(train_img), len(test_img)

(600, 146)

In [13]:
# Build one "<image path>\t<label>" entry per training image. The label is
# looked up by file name; basename copes with the platform's path separator,
# unlike split('/').
train_lines = [
    t + "\t" + true_labels[os.path.basename(t)]
    for t in train_img
]

# Preview the first five entries.
train_lines[:5]

['train/wa1071.png\tPRIVACY',
 'train/ny1237.png\tACX7518',
 'train/nj891.png\tNZV97Y',
 'train/ky729.png\t547KPE',
 'train/wy712.png\t4779']

In [14]:
# Write the training entries to train_list.txt, each terminated by a newline.
with open('train_list.txt', 'w') as list_file:
    list_file.writelines(entry + '\n' for entry in train_lines)

In [15]:
# Build one "<image path>\t<label>" entry per validation image. The label is
# looked up by file name; basename copes with the platform's path separator,
# unlike split('/').
test_lines = [
    t + "\t" + true_labels[os.path.basename(t)]
    for t in test_img
]

# Preview the first five entries.
test_lines[:5]

['val/nv1583.png\t326NEB',
 'val/in1658.png\tN0G0DS',
 'val/mi242.png\tG0LFING',
 'val/nv756.png\t839VGA',
 'val/va341.png\t1333LE']

In [16]:
# Write the validation entries to val_list.txt, each terminated by a newline.
with open('val_list.txt', 'w') as list_file:
    list_file.writelines(entry + '\n' for entry in test_lines)