# Custom OCR using YOLO and Tesseract
- https://medium.com/saarthi-ai/how-to-build-your-own-ocr-a5bb91b622ba
- https://www.arunponnusamy.com/preparing-custom-dataset-for-training-yolo-object-detector.html
- https://blog.insightdatascience.com/how-to-train-your-own-yolov3-detector-from-scratch-224d10e55de2 (https://github.com/AntonMu/TrainYourOwnYOLO)

In [None]:
import pandas as pd

import os

In [None]:
def convert_vott_csv_to_yolo(
    vott_df,
    labeldict={"Cat_Face": 0},
    path="",
    target_name="data_train.txt",
    abs_path=False,
):
    """Write a VoTT label DataFrame as a YOLO annotation text file.

    Each output line has the form::

        <image> xmin,ymin,xmax,ymax,code [xmin,ymin,xmax,ymax,code ...]

    Consecutive rows that share the same ``image`` value are merged onto one
    line; rows for the same image that are not adjacent in ``vott_df`` end up
    on separate lines. Lines are separated by a newline and the file has no
    trailing newline.

    Note:
        ``vott_df`` is modified in place: a ``code`` column is added when it
        is missing and the four box columns are replaced by rounded values.

    Args:
        vott_df: DataFrame with the columns ``image``, ``xmin``, ``ymin``,
            ``xmax``, ``ymax`` and either ``code`` or ``label``. The column
            ``image_path`` is also required when ``abs_path`` is true.
        labeldict: Mapping from label name to integer class code. It is only
            read (never mutated) and only used when ``vott_df`` has no
            ``code`` column.
        path: Prefix joined onto each ``image`` value with ``os.path.join``
            when ``abs_path`` is false.
        target_name: Path of the text file to write (overwritten if present).
        abs_path: When true, use the ``image_path`` column verbatim instead
            of joining ``path`` and ``image``.

    Returns:
        Always ``True``.

    Raises:
        KeyError: If a label is missing from ``labeldict`` or a required
            column is missing from ``vott_df``.
    """
    box_columns = ["xmin", "ymin", "xmax", "ymax"]

    # Encode labels according to labeldict if codes don't exist yet.
    if "code" not in vott_df.columns:
        vott_df["code"] = vott_df["label"].apply(lambda x: labeldict[x])
    # Round float coordinates to ints.
    for col in box_columns:
        vott_df[col] = vott_df[col].apply(round)

    # One entry per output line: [image reference, box, box, ...].
    lines = []
    last_image = None
    for _, row in vott_df.iterrows():
        # Start a new line whenever the image differs from the previous row.
        if not lines or row["image"] != last_image:
            if abs_path:
                image_ref = row["image_path"]
            else:
                image_ref = os.path.join(path, row["image"])
            lines.append([image_ref])
        box = ",".join(str(x) for x in row[box_columns + ["code"]].tolist())
        lines[-1].append(box)
        last_image = row["image"]

    # The context manager closes the file even if the write fails.
    with open(target_name, "w") as file:
        file.write("\n".join(" ".join(parts) for parts in lines))
    return True

## Gas

In [None]:
# Gas data set: read the VoTT CSV export containing the box labels
# and preview its first rows.
multi_df = pd.read_csv(
    filepath_or_buffer="../data/labelled/gas/vott-csv-export/Hummel-Meter-export.csv"
)
multi_df.head()

In [None]:
# Give every distinct label a consecutive integer class code (following the
# order in which unique() returns them), then drop fully duplicated rows
# from the frame in place.
labels = multi_df["label"].unique()
labeldict = {label: code for code, label in enumerate(labels)}
multi_df.drop_duplicates(inplace=True)

In [None]:
# Path to images; passed as the prefix that is joined onto each image name
# in the generated annotation file.
train_path = "../data/labelled/gas/vott-csv-export/"
# Output file that receives the YOLO-format annotations.
target_name = "../data/labelled/gas/vott-yolo-export/Hummel-Meter-export.yolo"

In [None]:
# Make sure the output directory exists before the annotation file is written.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
target_dir = os.path.dirname(target_name)
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Write the gas annotations to target_name in YOLO text format.
convert_vott_csv_to_yolo(
    vott_df=multi_df,
    labeldict=labeldict,
    path=train_path,
    target_name=target_name,
)

## Strom

In [None]:
# Strom (electricity) data set: read the VoTT CSV export containing the box
# labels and preview its first rows.
multi_df = pd.read_csv(
    filepath_or_buffer="../data/labelled/strom/vott-csv-export/Stromzaehler-export.csv"
)
multi_df.head()

In [None]:
# Give every distinct label a consecutive integer class code (following the
# order in which unique() returns them), then drop fully duplicated rows
# from the frame in place.
labels = multi_df["label"].unique()
labeldict = {label: code for code, label in enumerate(labels)}
multi_df.drop_duplicates(inplace=True)

In [None]:
# Path to images; passed as the prefix that is joined onto each image name
# in the generated annotation file.
train_path = "../data/labelled/strom/vott-csv-export/"
# Output file that receives the YOLO-format annotations.
# NOTE(review): the file name still says "Hummel-Meter-export" although the
# input CSV is "Stromzaehler-export.csv" -- possibly a copy-paste leftover from
# the gas section; confirm before renaming, downstream steps may use this name.
target_name = "../data/labelled/strom/vott-yolo-export/Hummel-Meter-export.yolo"

In [None]:
# Make sure the output directory exists before the annotation file is written.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
target_dir = os.path.dirname(target_name)
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Write the Strom annotations to target_name in YOLO text format.
convert_vott_csv_to_yolo(
    vott_df=multi_df,
    labeldict=labeldict,
    path=train_path,
    target_name=target_name,
)