<a href="https://colab.research.google.com/github/i12playwow/whisper-standalone-win/blob/main/notebook/EasyOCR_4a_VSF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EasyOCR to process videosubfinder images

1. Default location is TXTImages folder in your Google Drive.
2. Default detection language is Simplified Chinese. Please check your source.
3. Results are saved in TXTResults folder.

Happy subbing!


In [9]:
#@markdown **Installing EasyOCR --please wait, it takes 1 minute**
%%capture
!pip install git+https://github.com/JaidedAI/EasyOCR.git
!pip install tqdm


In [10]:
# Define the mapping of language names to codes
language_codes = {
    'Arabic': 'ar',
    'Simplified Chinese': 'ch_sim',
    'Traditional Chinese': 'ch_tra',
    'German': 'de',
    'English': 'en',
    'Spanish': 'es',
    'Persian (Farsi)': 'fa',
    'French': 'fr',
    'Hindi': 'hi',
    'Indonesian': 'id',
    'Italian': 'it',
    'Japanese': 'ja',
    'Korean': 'ko',
    'Russian': 'ru',
    'Swedish': 'sv',
    'Thai': 'th',
    'Vietnamese': 'vi',
}

language = 'Traditional Chinese'  # @param ['Simplified Chinese', 'Traditional Chinese', 'English', 'Japanese', 'Korean']

# Get the corresponding language code
language_code = language_codes.get(language)

print(f'Selected Language: {language}, Language Code: {language_code}')



Selected Language: Traditional Chinese, Language Code: ch_tra


In [11]:
#@markdown **Connect your Google Drive.**
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
folder = "/content/drive/MyDrive/TXTImages"  # @param {type:"string"}

In [13]:
#@markdown **Global definitions**
extensions=[".jpg",".png",".jpeg",".bmp"]
directoryDefault=folder
languagesDefault="ch_sim"
import os
import argparse
import shutil
import easyocr
from tqdm import tqdm
from google.colab import files

def runEasyOCR(langs=language_code, directory="/content/drive/MyDrive/TXTImages"):
    if not os.path.isdir(directory):
        print ("Not exists directory: " + directory )
        return

    parentDirectory = os.path.dirname(directory)
    directoryTXTResults = os.path.join(parentDirectory, "TXTResults")
    if os.path.isdir(directoryTXTResults):
        directoryTxt = directoryTXTResults
    else:
        os.makedirs(directoryTXTResults)
        print(f"Directory {directoryTXTResults} created.")
        directoryTxt = directoryTXTResults

    os.system("title OCR for " + directory + " - " + langs)
    reader = easyocr.Reader( langs.replace(" ","").split(",") )

    files = sorted([x for x in os.listdir(directory) if os.path.splitext(x)[1] in extensions])
    print("\n\n=== EasyOCR detecting models loaded. Now will start image processing ===")
    print(f"Starting to process {len(files)} image files...")
    print(f"The results are stored in folder: {directoryTXTResults} in your Google Drive...\n")


    for i,x in enumerate(tqdm(files, desc="Processing images", unit="image")):
        fileImage = os.path.join(directory,x)
        fileTxt = os.path.join(directoryTxt,x)
        result = reader.readtext(fileImage,detail=0, paragraph=True)
        with open(fileTxt+".txt", "w", encoding="utf-8") as f:
            f.write( " ".join(result) )

    # Return the path to the output directory
    return directoryTxt


In [None]:
#@markdown **Run EasyOCR. Results will be in TXTResults folder**
if __name__ == "__main__":
    outputfolder = runEasyOCR(language_code, directoryDefault)

    # zip and download results


In [18]:
#@markdown **Run EasyOCR. Results will be in TXTResults folder**
if __name__ == "__main__":
    outputfolder = runEasyOCR(language_code, directoryDefault)
    # Check if outputfolder is not None before proceeding
    if outputfolder is not None:
        # zip and download results
        directory_to_zip = outputfolder
        base_name = os.path.basename(directory_to_zip)  # Use the directory name for the zip file
        shutil.make_archive(base_name, 'zip', directory_to_zip)
        files.download(base_name + ".zip")
#@markdown **Run EasyOCR. Results will be in TXTResults folder**
if __name__ == "__main__":
    outputfolder = runEasyOCR(language_code, directoryDefault)
    # Check if outputfolder is not None before proceeding
    if outputfolder is not None:
        # zip and download results
        directory_to_zip = outputfolder
        base_name = os.path.basename(directory_to_zip)  # Use the directory name for the zip file
        shutil.make_archive(base_name, 'zip', directory_to_zip)
        files.download(base_name + ".zip")
        print(f"{os.path.basename(directory_to_zip)} folder content is zipped and downloaded")

        print("\n=== All Done")

Not exists directory: /content/drive/MyDrive/TXTImages
Not exists directory: /content/drive/MyDrive/TXTImages


# New Section