In [1]:
import os

In [72]:
# Root directory of the dataset currently in use; stays empty until set_dataset_path() is called.
dataset_path = ""

def set_dataset_path(path: str):
    """Rebind the module-level ``dataset_path`` variable.

    Args:
        path: New dataset root directory; stored exactly as given, without validation.
    """
    global dataset_path
    dataset_path = path

# Select the dataset root used by the cells below.
# Alternative root (CCPD2019); swap it with the active line to work on that dataset instead:
# set_dataset_path("../dataset/CCPD2019")
set_dataset_path("../dataset/CCPD2020/ccpd_green")

### Convert to Python Script

Run the function `convert_to_script()` to convert the notebook into a Python script, so that its functions can be used in other notebooks.

In [31]:
def convert_to_script():
    '''Turns the notebook into a python script to use functions in other notebooks

    Runs `jupyter nbconvert --to script` on `01_data_exploration.ipynb` through the
    IPython `!` shell escape, so it only works inside an IPython/Jupyter session.
    The notebook file name is hard-coded and given as a relative path.
    '''
    !jupyter nbconvert --to script 01_data_exploration.ipynb

# convert_to_script()

[NbConvertApp] Converting notebook 01_data_exploration.ipynb to script
[NbConvertApp] Writing 12046 bytes to 01_data_exploration.py


## Extract Part of the Images

The full dataset is available at https://github.com/detectRecog/CCPD.

Because it includes over 300,000 images, we will start working with a smaller cut of the dataset at the beginning of this project, to avoid being slowed down by hardware limitations.


That's why we will extract the first 5000 pictures from the **CCPD2019** dataset in `CCPD2019/ccpd_base/` in the first step.


In [4]:
import tarfile

def extract_images_from_tar_path(tar_path: str, output_dir: str, num_images_to_extract: int):
    '''Extracts a specified number of images from a tar.xz file to a specified output directory

    Only regular files whose archive name starts with `CCPD2019/ccpd_base/` are
    extracted; their archive-relative paths are recreated below `output_dir`.

    Args:
        tar_path: Path to the xz-compressed tar archive.
        output_dir: Directory to extract into; created if it does not exist.
        num_images_to_extract: Maximum number of files to extract. Values <= 0
            extract nothing.
    '''
    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

    # Counter to track the number of extracted images
    extracted_count = 0

    with tarfile.open(tar_path, "r:xz") as tar:
        for member in tar:
            # Stop as soon as the requested number of images has been extracted
            # (checked first so that a limit of 0 extracts nothing).
            if extracted_count >= num_images_to_extract:
                break

            # Only regular files inside the desired archive folder are extracted
            if member.name.startswith("CCPD2019/ccpd_base/") and member.isfile():
                tar.extract(member, path=output_dir)
                extracted_count += 1

    print(f"Successfully extracted {extracted_count} images to {output_dir}")

Extracted: CCPD2019/ccpd_base/0111242816092-89_90-265&500_443&565-449&556_277&578_277&515_449&493-0_0_27_33_33_23_26-144-12.jpg
Extracted: CCPD2019/ccpd_base/0239762931034-93_68-228&431_515&519-536&535_260&508_218&428_494&455-0_0_26_25_32_15_24-91-67.jpg
Extracted: CCPD2019/ccpd_base/0329274425288-98_72-207&491_488&617-508&623_213&574_197&486_492&535-0_0_19_23_26_27_27-142-100.jpg
Extracted: CCPD2019/ccpd_base/0322677203065-89_89-197&460_493&578-485&557_200&574_204&467_489&450-0_0_8_11_30_30_30-73-205.jpg
Extracted: CCPD2019/ccpd_base/0430675287357-113_54-244&460_474&652-482&662_245&547_237&450_474&565-0_0_24_29_20_27_29-71-54.jpg
Extracted: CCPD2019/ccpd_base/0207854406131-96_76-271&598_503&702-503&709_267&668_252&581_488&622-0_2_23_17_32_25_33-125-62.jpg
Extracted: CCPD2019/ccpd_base/0372629310345-101_71-217&423_495&567-521&581_218&511_200&408_503&478-0_0_7_30_31_27_31-152-71.jpg
Extracted: CCPD2019/ccpd_base/023917624521-95_77-219&515_456&622-457&621_222&590_210&508_445&539-0_0_30_2

Adjust the path to the .tar.xz file and the output directory to save the extracted images.
Then include the function call in the code.

```python 
extract_images_from_tar_path(tar_path, output_dir, num_images_to_extract)
```

In [None]:
# Path to the .tar.xz file
tar_path = "../../CCPD2019.tar.xz" # Change to your location if necessary
# Output directory to save the extracted images
# (the name `output_dir` is assigned again further down in the partitioning cell)
output_dir = "../extracted_images"
# Limit the number of images to extract
num_images_to_extract = 5000

# Uncomment the following line to run the extraction:
# extract_images_from_tar_path(tar_path, output_dir, num_images_to_extract)

## Split the Dataset into train/test/val

In [4]:
from sklearn.model_selection import train_test_split
import shutil

# Helper function to copy files
def copy_files(file_list: list[str], dest_folder: str, src_folder=None):
    '''Copies a list of files to a specified destination folder

    Args:
        file_list: File names (not full paths) to copy.
        dest_folder: Directory the files are copied into; created if it does
            not exist.
        src_folder: Directory the files are read from. When omitted, the
            module-level `data_dir` variable is used, which was the only
            behaviour before this parameter existed.
    '''
    if src_folder is None:
        # Backward-compatible fallback to the notebook-level variable.
        src_folder = data_dir

    os.makedirs(dest_folder, exist_ok=True)

    for file_name in file_list:
        src_path = os.path.join(src_folder, file_name)
        dest_path = os.path.join(dest_folder, file_name)
        shutil.copy(src_path, dest_path)

def create_dataset_partitions(data_dir: str, output_dir: str):
    '''Creates a dataset partition into train, val, and test folders

    The files in `data_dir` are split 80% / 10% / 10% into train, val and test
    and copied to `<output_dir>/<partition>/images`.

    Args:
        data_dir: Directory containing the image files to partition.
        output_dir: Root directory below which the partition folders are created.
    '''
    # Create output directories
    for partition in ("train", "val", "test"):
        os.makedirs(f"{output_dir}/{partition}/images", exist_ok=True)

    # os.listdir() returns entries in arbitrary order, so sort them; otherwise the
    # fixed random_state below would not give the same split on every machine/run.
    image_files = sorted(
        f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))
    )

    # Hold out 20% of the files ...
    train_files, holdout_files = train_test_split(image_files, test_size=0.2, random_state=58)

    # ... and divide the hold-out set evenly into test and val
    test_files, val_files = train_test_split(holdout_files, test_size=0.5, random_state=58)

    # Copy from the `data_dir` argument itself rather than from a same-named global
    partitions = {"train": train_files, "val": val_files, "test": test_files}
    for partition, files in partitions.items():
        for file_name in files:
            shutil.copy(
                os.path.join(data_dir, file_name),
                os.path.join(f"{output_dir}/{partition}/images", file_name),
            )

    print(f"Train: {len(train_files)} images")
    print(f"Val: {len(val_files)} images")
    print(f"Test: {len(test_files)} images")

Train: 4000 images
Val: 500 images
Test: 500 images


Run `create_dataset_partitions(data_dir, output_dir)` to create the dataset partitions.
Adjust the path to the directory containing the images and the output directory if necessary.

In [None]:
# Path to the directory containing the images
data_dir = "../extracted_images/CCPD2019/ccpd_base"
# Path to the output directory
# NOTE(review): `dataset_path` is set to the CCPD2020/ccpd_green folder near the top of this
# notebook while `data_dir` points at CCPD2019 images — confirm this is the intended target.
output_dir = dataset_path

create_dataset_partitions(data_dir, output_dir)

## Create Text File with Image Annotations and labels.csv

Create a text file `file_names.txt` in each dataset partition folder (`train`, `val`, `test`) that lists all file names in that partition's `images/` directory.

Each line of the `file_names.txt` file has the filename for exactly one image.
The filename includes all available labels for the image.

### A sample image name is:

`025-95_113-154&383_386&473-386&473_177&454_154&383_363&402-0_0_22_27_27_33_16-37-15.jpg`

Each name can be split into seven fields, divided by the character `-`.

025 **-** 95_113 **-** 154&383_386&473 **-** 386&473_177&454_154&383_363&402 **-** 0_0_22_27_27_33_16 **-** 37 **-** 15

1. **Area**: Area ratio of license plate area to the entire picture area.

   `Area = 025` (normalized value between 0 and 1)

2. **Tilt degree**: Horizontal tilt degree and vertical tilt degree.

   `Tilt degree = 95_113` (Horizontal and vertical tilt degree, 90 is normal)

3. **Bounding box coordinates**: The coordinates of the left-up and the right-bottom vertices.

   `Bounding box coordinates = 154&383_386&473`

4. **Four vertices locations**: The exact (x, y) coordinates of the four vertices of LP in the whole image. These coordinates start from the right-bottom vertex.
`Four vertices locations: = 386&473_177&454_154&383_363&402`

5. **License plate number**: Each image in CCPD has only one LP. Each LP number is comprised of a Chinese character, a letter, and five letters or numbers. A valid Chinese license plate consists of seven characters: province (1 character), alphabets (1 character), alphabets+digits (5 characters). "0_0_22_27_27_33_16" is the index of each character. These three arrays are defined as follows. The last character of each array is letter O rather than a digit 0. We use O as a sign of "no character" because there is no O in Chinese license plate characters.

	`License plate number = 0_0_22_27_27_33_16`

    	provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
	
        alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
             'X', 'Y', 'Z', 'O']

       ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
       'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']

   
       
6. **Brightness**: The brightness of the license plate region.

   `Brightness = 37`

7. **Blurriness**: The blurriness of the license plate region.

   `Blurriness = 15`

### Create Text File with Image Annotations for each dataset partition

In [73]:
from os import listdir
from os.path import isfile, join

def create_images_txt(folder_name, path):
    '''Creates a text file with the image names for a specified dataset partition folder_name (train, val, test )

    Lists the regular files in `<path>/<folder_name>/images` and writes their
    names, one per line and sorted alphabetically, to
    `<path>/<folder_name>/file_names.txt` (overwriting an existing file).

    Args:
        folder_name: Name of the partition folder, e.g. "train".
        path: Dataset root directory that contains the partition folder.
    '''
    partition_dir = f"{path}/{folder_name}"

    # Path to the image directory
    image_path = f"{partition_dir}/images"

    # listdir() returns entries in arbitrary order; sorting makes the generated
    # file (and everything derived from it) reproducible across runs.
    image_files = sorted(f for f in listdir(image_path) if isfile(join(image_path, f)))

    with open(f"{partition_dir}/file_names.txt", "w") as text_file:
        text_file.writelines(f"{image}\n" for image in image_files)

    print(len(image_files))
    print(f"Successfully extracted {len(image_files)} image file names from {folder_name}/images to {folder_name}/file_names.txt.")

def create_all_txt_files_for_dataset(dataset_path):
    """Write ``file_names.txt`` for the test, train and val partitions, in that order."""
    for partition in ("test", "train", "val"):
        create_images_txt(partition, dataset_path)

In [33]:
# Write file_names.txt for the test, train and val partitions below `dataset_path`.
create_all_txt_files_for_dataset(dataset_path)

5006
Successfully extracted 5006 image file names from test/images to test/file_names.txt.
5769
Successfully extracted 5769 image file names from train/images to train/file_names.txt.
1001
Successfully extracted 1001 image file names from val/images to val/file_names.txt.


### Create Label CSV-Files

In [74]:
import csv

def turn_txt_to_csv(folder_name, path):
    '''Extracts the dataset labels from the image file_name and saves them in a csv file

    Reads `<path>/<folder_name>/file_names.txt` (one image file name per line)
    and writes `<path>/<folder_name>/labels.csv` with a header row plus one row
    per image. The label columns are the first seven `-`-separated fields of
    the file name without its extension. Blank lines are ignored.

    Args:
        folder_name: Name of the partition folder, e.g. "train".
        path: Dataset root directory that contains the partition folder.

    Raises:
        ValueError: If a file name has fewer than seven `-`-separated fields.
    '''
    partition_dir = f"{path}/{folder_name}"

    with open(f"{partition_dir}/file_names.txt", "r") as file:
        filenames = [line.strip() for line in file]

    # Parse everything before opening the output file, so that a malformed name
    # cannot leave a truncated labels.csv behind.
    rows = []
    for filename in filenames:
        if not filename:
            continue  # tolerate blank lines (e.g. a trailing empty line)
        stem, _extension = os.path.splitext(filename)
        fields = stem.split("-")
        if len(fields) < 7:
            raise ValueError(
                f"Cannot parse labels from file name {filename!r}: "
                f"expected 7 '-'-separated fields, found {len(fields)}"
            )
        rows.append([filename, *fields[:7]])

    with open(f"{partition_dir}/labels.csv", "w", newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['filename', 'area', 'tilt_degree', 'bounding_box_coordinates', 'four_vertices_locations', 'license_plate_number', 'brightness', 'blurriness'])
        csvwriter.writerows(rows)

def create_csv_files(dataset_path):
    """Write ``labels.csv`` for the test, train and val partitions, in that order."""
    for partition in ("test", "train", "val"):
        turn_txt_to_csv(partition, dataset_path)

In [70]:
# Write labels.csv for the test, train and val partitions below `dataset_path`.
create_csv_files(dataset_path)

## Sort out unsuitable images

In [80]:
import pandas as pd

def turn_string_to_float(string: str):
    """Parse a digit string as a float with the decimal point placed after its first character.

    Example: "025" -> 0.25
    """
    return float(f"{string[:1]}.{string[1:]}")

def unsuitable_area(area: str, area_threshold: float):
    """Return True when the area label, converted to a float, is below ``area_threshold``."""
    return turn_string_to_float(area) < area_threshold

def unsuitable_tilt_degree(tilt_degree: str, deviation_threshold: int):
    """Return True when either tilt angle deviates from 90 by more than ``deviation_threshold``.

    ``tilt_degree`` is the label in the form "<horizontal>_<vertical>", e.g. "95_113".
    """
    angles = tilt_degree.split('_')
    horizontal_deviation = abs(int(angles[0]) - 90)
    vertical_deviation = abs(int(angles[1]) - 90)

    # The larger deviation exceeds the threshold exactly when at least one of them does
    return max(horizontal_deviation, vertical_deviation) > deviation_threshold

def unsuitable_brightness(brightness: str, brightness_threshold: int):
    """Return True when the integer brightness label is below ``brightness_threshold``."""
    brightness_value = int(brightness)
    return brightness_value < brightness_threshold

def unsuitable_blurriness(blurriness: str, blurriness_threshold: int):
    """Return True when the integer blurriness label is below ``blurriness_threshold``.

    NOTE(review): values *below* the threshold are rejected, i.e. a low label value
    is treated as unsuitable — confirm this matches the dataset's meaning of the field.
    """
    blurriness_value = int(blurriness)
    return blurriness_value < blurriness_threshold

def sort_out_unsuitable_images(dataset_path, folder_name, area_threshold = None, tilt_degree_deviation = None, brightness_threshold = None, blurriness_threshold = None):
    """Delete the images of one partition that fail any of the enabled checks.

    For every row of ``<dataset_path>/<folder_name>/labels.csv`` the enabled
    checks are evaluated in the order area, tilt degree, brightness, blurriness;
    the image file is deleted from ``<dataset_path>/<folder_name>/images`` as
    soon as one check reports it as unsuitable. Afterwards ``file_names.txt``
    and ``labels.csv`` of the partition are regenerated.

    Args:
        dataset_path: Dataset root directory.
        folder_name: Partition folder name, e.g. "train".
        area_threshold: Passed to ``unsuitable_area``; ``None`` disables the check.
        tilt_degree_deviation: Passed to ``unsuitable_tilt_degree``; ``None`` disables the check.
        brightness_threshold: Passed to ``unsuitable_brightness``; ``None`` disables the check.
        blurriness_threshold: Passed to ``unsuitable_blurriness``; ``None`` disables the check.
    """
    partition_dir = f"{dataset_path}/{folder_name}"

    def fails_any_check(row):
        # Same order and short-circuiting as an if/elif chain over the four checks
        return (
            (area_threshold is not None and unsuitable_area(row['area'], area_threshold))
            or (tilt_degree_deviation is not None and unsuitable_tilt_degree(row['tilt_degree'], tilt_degree_deviation))
            or (brightness_threshold is not None and unsuitable_brightness(row['brightness'], brightness_threshold))
            or (blurriness_threshold is not None and unsuitable_blurriness(row['blurriness'], blurriness_threshold))
        )

    # All columns are read as strings so the label fields keep their leading zeros
    labels = pd.read_csv(f"{partition_dir}/labels.csv", dtype=str)

    removed = 0
    for _, row in labels.iterrows():
        if fails_any_check(row):
            removed += 1
            os.remove(f"{partition_dir}/images/{row['filename']}")

    # Bring the file list and the label table back in sync with the remaining images
    create_images_txt(folder_name, dataset_path)
    turn_txt_to_csv(folder_name, dataset_path)

    print(f"Removed {removed} images with unsuitable area, tilt degree, brightness, or blurriness")

Run the function `sort_out_unsuitable_images(dataset_path, "train", area_threshold=0.16, tilt_degree_deviation=10, brightness_threshold=20)` to sort out unsuitable images.

Adjust the path to the dataset and the folder name if necessary.

Unsuitable images are removed from the dataset.

The function `sort_out_unsuitable_images()` also updates the `file_names.txt` and `labels.csv` files.

In [81]:
# WARNING: this permanently deletes image files from the train partition.
# The area check is disabled here; a candidate value is kept in the trailing comment.
sort_out_unsuitable_images(dataset_path, "train", tilt_degree_deviation=10, brightness_threshold=70, blurriness_threshold=100) # area_threshold=0.165

962
Successfully extracted 962 image file names from train/images to train/file_names.txt.
Removed 700 images with unsuitable area, tilt degree, brightness, or blurriness


### Calculate Ratio of License Plates in the Dataset: 

4. **Four vertices locations**: The exact (x, y) coordinates of the four vertices of LP in the whole image. These coordinates start from the right-bottom vertex.
`Four vertices locations: = 386&473_177&454_154&383_363&402`
`Order of the vertices (image coordinates, y grows downwards) = bottom-right_bottom-left_top-left_top-right`

In [77]:
import math
import pandas as pd
import numpy as np

def ratio(a, b, c):
    """Return the length of edge a-b divided by the length of edge c-b.

    Each argument is an (x, y) pair whose two items are converted with ``int()``.
    """
    ax, ay = int(a[0]), int(a[1])
    bx, by = int(b[0]), int(b[1])
    cx, cy = int(c[0]), int(c[1])

    def distance_to_b(x, y):
        # Euclidean distance from (x, y) to the shared corner b
        return math.sqrt((x - bx) ** 2 + (y - by) ** 2)

    return distance_to_b(ax, ay) / distance_to_b(cx, cy)

def one_folder(full_path, folder_name):
    """Return the ``ratio()`` value of every row in one partition's ``labels.csv``.

    The ``four_vertices_locations`` column is split at ``_`` into vertices and each
    vertex at ``&`` into its two coordinates; the first three vertices are passed
    to ``ratio()``.
    """
    labels = pd.read_csv(f"{full_path}/{folder_name}/labels.csv", dtype=str)

    ratios = []
    for encoded_vertices in labels['four_vertices_locations']:
        points = [point.split('&') for point in encoded_vertices.split('_')]
        ratios.append(ratio(points[0], points[1], points[2]))

    return ratios

def calculate_ratio_of_license_plates(full_path, folder_names):
    """Print count, mean and standard deviation of the plate ratios and return the mean.

    Args:
        full_path: Dataset root directory containing the partition folders.
        folder_names: Partition folder names whose ``labels.csv`` files are evaluated.

    Returns:
        The mean of all ratios collected from the given partitions.
    """
    ratios = [
        value
        for folder in folder_names
        for value in one_folder(full_path, folder)
    ]
    print(len(ratios))

    data = np.array(ratios)
    average_ratio = np.mean(data)
    std = np.std(data)

    print(f"Average ratio of license plates: {average_ratio}")
    print(f"Standard deviation of license plates: {std}")
    return average_ratio


In [66]:
# NOTE(review): the dataset root is spelled out here instead of reusing `dataset_path`.
calculate_ratio_of_license_plates("../dataset/CCPD2020/ccpd_green/", ["train", "val", "test"])

11776
Average ratio of license plates: 3.4691643983855824
Standard deviation of license plates: 0.5240157580842137


np.float64(3.4691643983855824)