# Import zone

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
!pip install pytesseract
!pip install deskew

In [None]:
import cv2
import PIL
from PIL import Image
from PIL import ImageDraw
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import time
import numpy as np
import seaborn as sns
import pandas as pd
import shutil

from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import pytesseract
import re
import seaborn as sns
sns.set_theme()

# OCR

## pytesseract

### preprocess

In [None]:
from deskew import determine_skew


# Rotate the image around its center
def rotateImage(cvImage, angle: float):
    """Return a copy of ``cvImage`` rotated about its center by ``angle`` degrees.

    The angle is limited to the range [-15, 15] before rotating. The output has
    the same width and height as the input; pixels uncovered by the rotation are
    filled by replicating the border.
    """
    # Limit the correction to +/- 15 degrees.
    if angle > 15:
        angle = 15.0
    elif angle < -15:
        angle = -15.0

    height, width = cvImage.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv2.warpAffine(
        cvImage.copy(),
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

# Deskew image
def deskew(cvImage):
    """Estimate the skew of ``cvImage`` and return a rotated (deskewed) copy.

    ``determine_skew`` can return ``None`` when it cannot estimate an angle;
    in that case an unrotated copy of the input is returned instead of
    failing on the angle comparison inside ``rotateImage``.
    """
    angle = determine_skew(cvImage)
    if angle is None:
        return cvImage.copy()
    return rotateImage(cvImage, angle)

In [None]:
root = '/content/Dataset/images/images'

local=f'{root}/00182.jpg'
# Flag 0 loads the file as a single-channel grayscale image.
img = cv2.imread(local,0)
if img is None:
    raise FileNotFoundError(local)
fixed = deskew(img)

# Both images are single-channel, so they must be expanded with GRAY2RGB;
# COLOR_BGR2RGB only accepts 3- or 4-channel input.
img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
fixed = cv2.cvtColor(fixed, cv2.COLOR_GRAY2RGB)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))

ax1.imshow(img)
ax1.axis('off')
ax1.set_title("Original image")

ax2.imshow(fixed)
ax2.axis('off')
ax2.set_title("deskew image")

plt.show()

In [None]:
root = '/content/Dataset/images/images'
local=f'{root}/00182.jpg'
imgresize = cv2.imread(local)

# cv2.imwrite does not create missing folders, so make sure the target exists.
os.makedirs('image_cut3', exist_ok=True)

#remove noise
dst = cv2.fastNlMeansDenoisingColored(imgresize, None, 5, 5, 7, 21)
# Grayscale
gray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)

deskew_ = deskew(gray)
# Annotated copy: bounding boxes are drawn on this one, crops are taken from deskew_.
imgresize2 = deskew_.copy()
gray_blur = cv2.GaussianBlur(deskew_,(15,15),0)
# With THRESH_OTSU the threshold is chosen automatically; the 180 is not used.
thresh=cv2.threshold(gray_blur, 180, 255, cv2.THRESH_BINARY +cv2.THRESH_OTSU)[1]
kernel=np.ones((5,14),np.uint8)
# Erosion takes the local minimum, so dark pixels spread over the 5x14 window
# (presumably merging the characters of a word into one blob -- TODO confirm).
closing = cv2.erode(thresh,kernel,iterations = 2)
result_img=closing.copy()

contours,hierachy=cv2.findContours(result_img,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
print(len(contours))

kernel_sharp = np.array([[0,-1,0],
                         [-1,5,-1],
                         [0,-1,0]],dtype=np.float32)

# 1x1 structuring elements: the morphological passes below leave the image unchanged.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))
kernel3 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))

plt.figure(figsize=(20, 2))
max_panels = 2 * 8  # the preview grid is 2 rows x 8 columns
ax = None

for i, cnt in enumerate(contours):
    # The last contour is skipped (presumably the page-sized one -- TODO confirm).
    if i == len(contours)-1:
        continue

    (x,y,w,h)= cv2.boundingRect(cnt)

    # Pad the box a little, clamping at 0: a negative start index would wrap
    # around to the other side of the image and give an empty or wrong crop.
    # Crop from the clean deskewed image so earlier rectangles never leak in.
    result = deskew_[max(y-3,0):y+h+3, max(x-2,0):x+w+5]
    h_,w_ = result.shape
    result = cv2.filter2D(result,-1,kernel_sharp)
    result = cv2.fastNlMeansDenoising(result, None, 20, 7, 21)
    result = cv2.resize(result, (w_*8,h_*8), interpolation = cv2.INTER_AREA)

    result = cv2.GaussianBlur(result,(11,11),0)
    result=cv2.threshold(result,180,255,cv2.THRESH_OTSU)[1]

    result = cv2.erode(result,kernel2,iterations = 1)
    result =cv2.dilate(result,kernel3,iterations = 10)

    result =cv2.morphologyEx(result,cv2.MORPH_OPEN,kernel2,iterations=5)
    #result =cv2.morphologyEx(result,cv2.MORPH_CLOSE,kernel2,iterations=4)

    cv2.imwrite('image_cut3/{}.jpg'.format(i),result)

    #forplot -- only the first 16 crops fit in the 2x8 preview grid
    if i < max_panels:
        ax = plt.subplot(2,8,i+1)
        # The crop is single-channel, so GRAY2RGB is the valid conversion.
        ax.imshow(cv2.cvtColor(result, cv2.COLOR_GRAY2RGB))
        ax.axis('off')
    cv2.rectangle(imgresize2,(x,y),(x+w,y+h),(0,0,255),2)

if ax is not None:
    ax.text(-180,-80, 'kernel_sharp image',fontsize=18 )

plt.figure(figsize=(20, 70))

def plot_step(img,n,text='image') :
    """Show ``img`` in slot ``n`` of a 5x2 subplot grid with ``text`` as its title.

    ``img`` may be a single-channel (2-D) or a BGR (3-channel) OpenCV array;
    it is converted to RGB for matplotlib accordingly.
    """
    ax1 = plt.subplot(5,2,n)
    # COLOR_BGR2RGB rejects single-channel input, so grayscale needs GRAY2RGB.
    if img.ndim == 2:
        img=cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    else:
        img=cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    ax1.imshow(img)
    ax1.axis('off')
    plt.title(text,fontsize=18)

plot_step(imgresize,1,text="Original image")
plot_step(dst,2,text="fastNlMeansDenoisingColored image")
plot_step(gray,3,text="Gray image")
plot_step(deskew_,4,text="deskew image")
plot_step(gray_blur,5,text="GaussianBlur image")
plot_step(thresh,6,text="THRESH_OTSU image")
plot_step(closing,7,text="Closing image (Erode)")
plot_step(imgresize2,8,text="detect text image")
plt.subplots_adjust(wspace=0.10, hspace=0.05)
plt.show()

In [None]:
from glob import glob

path_image_cut= 'image_cut4'
path_image = '/content/Dataset/images/images'
# Start from an empty output folder (a missing folder is not an error).
shutil.rmtree(path_image_cut, ignore_errors=True)
#Create Folder
os.mkdir(path_image_cut)


# Fixed kernels, built once instead of once per image/crop.
kernel_sharp = np.array([[0,-1,0],
                   [-1,5,-1],
                   [0,-1,0]],dtype=np.float32)
kernel=np.ones((5,14),np.uint8)
# 1x1 structuring element: the MORPH_OPEN pass below leaves the image unchanged.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))

# List the folder once and sort it: os.listdir order is arbitrary, and the
# output numbering below depends on each image's position in this list.
image_names = sorted(os.listdir(path_image))

num=0
for index, image_name in enumerate(tqdm(image_names)):

    imgresize = cv2.imread(path_image+'/'+image_name)
    if imgresize is None:
        # Not a readable image; its block of output numbers stays unused.
        print('skipping unreadable file:', image_name)
        continue

    # remove noise
    dst = cv2.fastNlMeansDenoisingColored(imgresize, None, 5, 5, 7, 21)

    # Grayscale
    gray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)
    deskew_ = deskew(gray)
    imgresize2 = deskew_
    gray_blur = cv2.GaussianBlur(deskew_,(15,15),0)
    # With THRESH_OTSU the threshold is chosen automatically; the 180 is not used.
    thresh=cv2.threshold(gray_blur, 180, 255, cv2.THRESH_BINARY +cv2.THRESH_OTSU)[1]
    closing = cv2.erode(thresh,kernel,iterations = 2)
    result_img=closing.copy()

    contours,hierachy=cv2.findContours(result_img,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
    print(len(contours),index)
    # Crops of image `index` are numbered downwards from 14*(index+1)-1.
    num=14*(index+1)-1

    for i, cnt in enumerate(contours):
        # The last contour is skipped (presumably the page-sized one -- TODO confirm).
        if i == len(contours)-1:
            continue

        (x,y,w,h)= cv2.boundingRect(cnt)
        # Clamp the padded box at 0: a negative start index would wrap around
        # and produce an empty or wrong crop for words near the top/left edge.
        result = imgresize2[max(y-3,0):y+h+3, max(x-2,0):x+w+5]
        h_,w_ = result.shape

        result = cv2.filter2D(result,-1,kernel_sharp)
        result = cv2.fastNlMeansDenoising(result, None, 20, 7, 21)
        result = cv2.resize(result, (w_*8,h_*8), interpolation = cv2.INTER_AREA)
        result = cv2.GaussianBlur(result,(11,11),0)
        result=cv2.threshold(result,180,255,cv2.THRESH_OTSU)[1]
        result =cv2.morphologyEx(result,cv2.MORPH_OPEN,kernel2,iterations=5)

        cv2.imwrite(path_image_cut+'/{}.jpg'.format(num),result)
        num-=1

In [None]:
!zip -r data.zip /content/image_cut4

### pytesseract

In [None]:
! sudo apt-get install libpng-dev libjpeg-dev libtiff-dev zlib1g-dev
! sudo apt-get install gcc g++
! sudo apt-get install autoconf automake libtool checkinstall

In [None]:
# Each "!" line runs in its own subshell, so a standalone "! cd dir" has no
# effect on the following lines. Chain the commands so they share a directory.
! cd ~ && wget http://www.leptonica.org/source/leptonica-1.73.tar.gz && tar -zxvf leptonica-1.73.tar.gz
! cd ~/leptonica-1.73 && ./configure && make && sudo checkinstall && sudo ldconfig
! sudo apt-get install tesseract-ocr

In [None]:
! sudo tesseract --list-langs

In [None]:
! sudo apt-get install tesseract-ocr-tha
! sudo tesseract --list-langs

In [None]:
import pytesseract
print(pytesseract.get_tesseract_version())
print(pytesseract.get_languages())

In [None]:
list_num = [str(x) for x in range(1000)]
def clean_data(txt, j=None):
    """Normalise one raw OCR string into a single token.

    Steps, in order:
      1. drop newlines and ``|``;
      2. cut everything up to and including the first separator found, trying
         ``,`` then ``.`` then space then ``:`` (only the first kind that occurs);
      3. drop spaces and ``-``, and replace the Thai digit ``๕`` with ``&``;
      4. if the text has as many Thai runs as digit runs (and at least one),
         keep the first Thai run; if it has digits but no Thai, keep the first
         digit run unless ``j`` is given and ``str(j)`` occurs in the text, in
         which case the token becomes ``'_'``;
      5. an empty result becomes ``'_'``.

    Args:
        txt: raw OCR output; ``None`` is treated as empty.
        j: optional image index for the digit filter in step 4. It used to be
           read from a global variable of the same name; when omitted, the
           filter is skipped.

    Returns:
        The cleaned token, or ``'_'`` when nothing usable is left.
    """
    if txt is None:
        return '_'

    txt = txt.replace('\n','').replace('|','')

    # Strip a leading fragment up to the first separator kind that is present.
    for sep in (',', '.', ' ', ':'):
        if sep in txt:
            txt = txt[txt.index(sep)+1:]
            break

    txt = txt.replace(' ','').replace('-','').replace('๕','&')

    text = re.findall("[ก-๙]+", txt)
    num = re.findall("[0-9]+", txt)
    if text and len(text) == len(num):
        txt = text[0]
    elif not text and num:
        if j is None or str(j) not in txt:
            txt = num[0]
        else:
            txt = '_'

    if not txt:
        txt = '_'
    return txt


In [None]:
# test
def plot_step2(local,text='image') :
    """Read the image file at ``local`` with OpenCV and show it in one
    axis-free panel titled ``text``."""
    bgr = cv2.imread(local)
    panel = plt.subplot(1,1,1)
    # OpenCV loads BGR; matplotlib expects RGB.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    panel.imshow(rgb)
    panel.axis('off')
    plt.title(text,fontsize=18)
    plt.show()

In [None]:
n=0
n+=1
for j in range(1):
#for j in range(0,14000,1):
# for j in range(14*n-14,14*n,1):
# for j in range(len(os.listdir("/content/image_cut4/"))):
    #local = r'/content/image_cut4/{}.jpg'.format(j)
    local = '/content/image_process_leawjingjing/0.jpg'
    # Page-segmentation mode is passed as "--psm 13"; the previous
    # "-- psm13" spelling is not a valid tesseract option.
    txt = pytesseract.image_to_string(local, lang='tha',config='--oem 3 --psm 13') # ocr
    # clean_data is called with the text only, matching the batch OCR loop.
    txt= clean_data(txt)
    plot_step2(local)
    print("output :",txt )


In [None]:
words = []
path_image_cut= '/content/image_process_leawjingjing'
image_list = os.listdir(path_image_cut)
print(len(image_list))
# Set lookup keeps the per-image existence check O(1); testing membership in
# the list for every one of the 64568 ids would be quadratic.
image_set = set(image_list)
for i in tqdm(range(64568)):
    sub_local = str(i)+'.jpg'
    if sub_local in image_set:
        local = path_image_cut+'/'+sub_local
        txt = pytesseract.image_to_string(local, lang='tha',config='--oem 3') # --psm 13 --psm 8
        words.append(clean_data(txt))
    else:
        # No crop was produced for this id: keep the slot with a placeholder.
        words.append('_')

In [None]:
len(words)

In [None]:
df = pd.DataFrame({'Id': list(range(64568)), 'words' : words})
df

In [None]:
#edit sth
# Manual corrections for known OCR misreads. Only cells that equal a key
# exactly are replaced; substrings inside longer words are left alone.
ocr_fixes = {"ถา": "กา", "เมือ": "เมือง"}
df["words"] = df["words"].replace(ocr_fixes)

In [None]:
df.to_csv("pure-data.csv", index=False)

## EASYOCR

### preprocess

Grayscale + deskew + erosion

In [None]:
import sys
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image as im
from scipy.ndimage import interpolation as inter
from scipy.ndimage import rotate

In [None]:
from deskew import determine_skew


# Rotate the image around its center
def rotateImage(cvImage, angle: float):
    """Return a copy of ``cvImage`` rotated about its center by ``angle`` degrees.

    The output keeps the input width and height; uncovered pixels are filled by
    replicating the border. NOTE(review): this redefinition replaces the
    earlier ``rotateImage`` in this notebook, which also limited the angle to
    +/- 15 degrees; this version applies the angle as given.
    """
    rows, cols = cvImage.shape[:2]
    pivot = (cols // 2, rows // 2)
    transform = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    rotated = cv2.warpAffine(
        cvImage.copy(),
        transform,
        (cols, rows),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return rotated

# Deskew image
def deskew(cvImage):
    """Estimate the skew of ``cvImage`` and return a rotated (deskewed) copy.

    ``determine_skew`` can return ``None`` when it cannot estimate an angle;
    in that case an unrotated copy of the input is returned instead of passing
    ``None`` on to ``rotateImage``.
    """
    angle = determine_skew(cvImage)
    if angle is None:
        return cvImage.copy()
    return rotateImage(cvImage, angle)

In [None]:
# def deskew(imgs):
#   img = cv2.cvtColor(imgs, cv2.COLOR_BGR2RGB)
#   img = im.fromarray(img)
#   # convert to binary
#   wd, ht = img.size
#   def find_score(arr, angle):
#       data = rotate(arr, angle, reshape=False, order=0)
#       hist = np.sum(data, axis=1)
#       score = np.sum((hist[1:] - hist[:-1]) ** 2)
#       return hist, score
#   delta = 1
#   limit = 15
#   angles = np.arange(-limit, limit+delta, delta)
#   scores = []
#   for angle in angles:
#       hist, score = find_score(img, angle)
#       scores.append(score)
#   best_score = max(scores)
#   best_angle = angles[scores.index(best_score)]
#   #print(f'Best angle: {best_angle}')
#   # correct skew
#   data = rotate(img, best_angle, reshape=False, order=0)
#   img = im.fromarray(data)

#   return img

In [None]:
from google.colab.patches import cv2_imshow

In [None]:
root = '/content/gdrive/MyDrive/SuperAI/hack6/nithan-chadok-hybrid-ocr-ner/images/images'

local=root+'/00035.jpg'
img = cv2.imread(local)
if img is None:
    raise FileNotFoundError(local)
#img = np.array(img)
#print(img.shape)

# Estimate the skew on a grayscale view (as the other cells in this notebook
# do) and apply the rotation to the colour image.
angle = determine_skew(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
fixed = img.copy() if angle is None else rotateImage(img, angle)

# OpenCV images are BGR; convert both to RGB so matplotlib shows true colours.
img=cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
fixed=cv2.cvtColor(fixed, cv2.COLOR_BGR2RGB)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))

ax1.imshow(img)
ax1.axis('off')
ax1.set_title("Original image")

ax2.imshow(fixed)
ax2.axis('off')
ax2.set_title("deskew image")

plt.show()

In [None]:
root = '/content/gdrive/MyDrive/SuperAI/hack6/nithan-chadok-hybrid-ocr-ner/images/images'
local=root+'/00126.jpg'
imgresize = cv2.imread(local)
path_image_cut = 'image_cut_test'
# Start from an empty output folder (a missing folder is not an error).
shutil.rmtree(path_image_cut, ignore_errors=True)
os.mkdir(path_image_cut)

#remove noise
dst = cv2.fastNlMeansDenoisingColored(imgresize, None, 5, 5, 7, 21)
# Grayscale
gray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)

deskew_ = deskew(gray)
# Annotated copy: bounding boxes are drawn on this one, crops are taken from deskew_.
imgresize2 = deskew_.copy()
gray_blur = cv2.GaussianBlur(deskew_,(3,3),0)
thresh=cv2.threshold(gray_blur, 0, 255, cv2.THRESH_BINARY +cv2.THRESH_OTSU)[1]
kernel=np.ones((5,14),np.float32)
# Erosion takes the local minimum, so dark pixels spread over the 5x14 window
# (presumably merging the characters of a word into one blob -- TODO confirm).
closing = cv2.erode(thresh,kernel,iterations = 2)
result_img=closing.copy()

contours,hierachy=cv2.findContours(result_img,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
print(len(contours))

kernel_sharp = np.array([[0,-1,0],
                         [-1,5,-1],
                         [0,-1,0]],dtype=np.float32)

# 1x1 structuring elements: the morphological passes below leave the image unchanged.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))
kernel3 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))

plt.figure(figsize=(20, 2))
max_panels = 2 * 8  # the preview grid is 2 rows x 8 columns
ax = None
counter = 0

for i, cnt in enumerate(contours):
    # The last contour is skipped (presumably the page-sized one -- TODO confirm).
    if i != len(contours)-1:
        (x,y,w,h)= cv2.boundingRect(cnt)

        # Pad the box a little, clamping at 0: a negative start index would
        # wrap around to the other side of the image. Crop from the clean
        # deskewed image so earlier rectangles never leak into a crop.
        result = deskew_[max(y-3,0):y+h+3, max(x-2,0):x+w+5]
        h_,w_ = result.shape
        result = cv2.filter2D(result,-1,kernel_sharp)
        result = cv2.fastNlMeansDenoising(result, None, 20, 7, 21)
        result = cv2.resize(result, (w_*8,h_*8), interpolation = cv2.INTER_AREA)

        result = cv2.GaussianBlur(result,(11,11),0)
        result=cv2.threshold(result,0,255,cv2.THRESH_OTSU)[1]

        result = cv2.erode(result,kernel2,iterations = 1)
        result =cv2.dilate(result,kernel3,iterations = 10)

        result =cv2.morphologyEx(result,cv2.MORPH_OPEN,kernel2,iterations=5)
        #result =cv2.morphologyEx(result,cv2.MORPH_CLOSE,kernel2,iterations=4)

        cv2.imwrite(f'{path_image_cut}/{counter}.jpg',result)

        #forplot -- only the first 16 crops fit in the 2x8 preview grid
        if i < max_panels:
            ax = plt.subplot(2,8,i+1)
            # The crop is single-channel, so GRAY2RGB is the valid conversion.
            ax.imshow(cv2.cvtColor(result, cv2.COLOR_GRAY2RGB))
            ax.axis('off')
        cv2.rectangle(imgresize2,(x,y),(x+w,y+h),(0,0,255),2)
    counter+=1

if ax is not None:
    ax.text(-180,-80, 'kernel_sharp image',fontsize=18 )

plt.figure(figsize=(20, 70))

def plot_step(img,n,text='image') :
    """Show ``img`` in slot ``n`` of a 5x2 subplot grid with ``text`` as its title.

    ``img`` may be a single-channel (2-D) or a BGR (3-channel) OpenCV array;
    it is converted to RGB for matplotlib accordingly.
    """
    ax1 = plt.subplot(5,2,n)
    # COLOR_BGR2RGB rejects single-channel input, so grayscale needs GRAY2RGB.
    if img.ndim == 2:
        img=cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    else:
        img=cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    ax1.imshow(img)
    ax1.axis('off')
    plt.title(text,fontsize=18)

plot_step(imgresize,1,text="Original image")
plot_step(dst,2,text="fastNlMeansDenoisingColored image")
plot_step(gray,3,text="Gray image")
plot_step(deskew_,4,text="deskew image")
plot_step(gray_blur,5,text="GaussianBlur image")
plot_step(thresh,6,text="THRESH_OTSU image")
plot_step(closing,7,text="Closing image (Erode)")
plot_step(imgresize2,8,text="detect text image")
plt.subplots_adjust(wspace=0.10, hspace=0.05)
plt.show()

In [None]:
root = '/content/gdrive/MyDrive/SuperAI/hack6/nithan-chadok-hybrid-ocr-ner/images/images'
path_image_cut = 'image_process_leawjingjing'
# Start from an empty output folder. ignore_errors covers the "folder does not
# exist yet" case without a bare except that would also hide other failures.
shutil.rmtree(path_image_cut, ignore_errors=True)
os.mkdir(path_image_cut)

# Sorted so that an image's position (and thus its crop numbering) is stable.
all_path = sorted(os.listdir(root))
#all_path

In [None]:
# Fixed kernels, built once instead of once per image/crop.
kernel=np.ones((5,14),np.float32)
kernel_sharp = np.array([[0,-1,0],
                        [-1,5,-1],
                        [0,-1,0]],dtype=np.float32)
# 1x1 structuring elements: the morphological passes below leave the image unchanged.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))
kernel3 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))

num = 0
for index in tqdm(range(len(all_path))):
    #print(all_path[index])
    imgresize = cv2.imread(root+'/'+all_path[index])
    if imgresize is None:
        # Not a readable image; its block of output numbers stays unused.
        print('skipping unreadable file:', all_path[index])
        continue
    #remove noise
    dst = cv2.fastNlMeansDenoisingColored(imgresize, None, 5, 5, 7, 21)
    # Grayscale
    gray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)

    deskew_ = deskew(gray)
    imgresize2 = deskew_.copy()
    gray_blur = cv2.GaussianBlur(deskew_,(3,3),0)
    thresh=cv2.threshold(gray_blur, 0, 255, cv2.THRESH_BINARY +cv2.THRESH_OTSU)[1]
    closing = cv2.erode(thresh,kernel,iterations = 2)
    result_img=closing.copy()

    contours,hierachy=cv2.findContours(result_img,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
    print(index, len(contours))

    # No figure is created here: nothing is plotted in this loop, and one
    # unclosed figure per image would only accumulate memory.

    # Contours of image `index` are numbered downwards from 14*(index+1)-1;
    # the number advances for every contour, whether or not a crop is written.
    num=14*(index+1)-1

    for i, cnt in enumerate(contours):
        # The last contour is skipped (presumably the page-sized one -- TODO confirm).
        if i != len(contours)-1:
            (x,y,w,h)= cv2.boundingRect(cnt)

            # Clamp the padded box at 0: a negative start index would wrap
            # around and give an empty or wrong crop near the top/left edge.
            result = imgresize2[max(y-3,0):y+h+3, max(x-2,0):x+w+5]
            h_,w_ = result.shape
            if(not result.size==0):
                result = cv2.filter2D(result,-1,kernel_sharp)
                result = cv2.fastNlMeansDenoising(result, None, 20, 7, 21)
                result = cv2.resize(result, (w_*8,h_*8), interpolation = cv2.INTER_AREA)

                result = cv2.GaussianBlur(result,(11,11),0)
                result=cv2.threshold(result,0,255,cv2.THRESH_OTSU)[1]

                result = cv2.erode(result,kernel2,iterations = 1)
                result =cv2.dilate(result,kernel3,iterations = 10)

                result =cv2.morphologyEx(result,cv2.MORPH_OPEN,kernel2,iterations=5)
                #result =cv2.morphologyEx(result,cv2.MORPH_CLOSE,kernel2,iterations=4)

                cv2.imwrite(f'{path_image_cut}/{num}.jpg',result)

        num-=1

In [None]:
!zip -r data.zip /content/image_process_leawjingjing

In [None]:
!cp -r /content/image_process_leawjingjing /content/gdrive/MyDrive/SuperAI/hack6/data_processed

In [None]:
%rm -rf image_process_leawja

In [None]:
%rm -rf image_cut
%rm -rf image_cut+erode
%rm -rf image_cut+test
%rm -rf image_cut3
%rm -rf image_cut_test
%rm -rf image_pro
%rm -rf image_process
%rm -rf image_process_cut

In [None]:
# from glob import glob

# path_image_cut= 'image_process_leawja'
# path_image = '/content/gdrive/MyDrive/SuperAI/hack6/nithan-chadok-hybrid-ocr-ner/images/images'
# try: shutil.rmtree(path_image_cut)
# except : pass
# #Create Folder
# os.mkdir(path_image_cut)


# kernel_sharp = np.array([[0,-1,0],
#                    [-1,5,-1],
#                    [0,-1,0]],dtype=np.float32)

# num=0
# counter = 0
# for index in tqdm(range(len(os.listdir(path_image)))):

#     # print(path_image+'/'+os.listdir(path_image)[index])
#     imgresize = cv2.imread(path_image+'/'+os.listdir(path_image)[index])

#     # remove noise
#     dst = cv2.fastNlMeansDenoisingColored(imgresize, None, 5, 5, 7, 21)

#     # Grayscale
#     gray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)

#     deskew_ = deskew(gray)
#     imgresize2 = deskew_.copy()
#     gray_blur = cv2.GaussianBlur(deskew_,(5, 5),0)
#     thresh=cv2.threshold(gray_blur, 0, 255, cv2.THRESH_BINARY +cv2.THRESH_OTSU)[1]
#     #kernel=np.ones((5,14),np.uint8)
#     kernel1 = np.ones((5, 14), np.float32)
#     closing = cv2.erode(thresh,kernel,iterations = 2)
#     result_img=closing.copy()

#     contours,hierachy=cv2.findContours(result_img,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
#     w,h= result_img.shape
#     print(len(contours),index)
#     num=14*(index+1)-1
#     #counter+=14

#     for i, cnt in enumerate(contours):
#         area = cv2.contourArea(cnt)

#         if i != len(contours)-1:
#             (x,y,w,h)= cv2.boundingRect(cnt)
#             result = imgresize2[y-3:y+h+3,x-2:x+w+5]
#             h_,w_ = result.shape
#             #print(len(contours))

#             if(not result.size == 0):
#                 result = cv2.filter2D(result,-1,kernel_sharp)
#                 result = cv2.fastNlMeansDenoising(result, None, 20, 7, 21)
#                 result = cv2.resize(result, (w_*8,h_*8), interpolation = cv2.INTER_AREA)
#                 result = cv2.GaussianBlur(result,(11,11),0)
#                 result=cv2.threshold(result, 0,255,cv2.THRESH_OTSU)[1]
#                 kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))
#                 kernel3 = cv2.getStructuringElement(cv2.MORPH_RECT,(1,1))

#                 result = cv2.erode(result, kernel2, iterations=1)
#                 result = cv2.dilate(result, kernel3, iterations=10)
#                 result =cv2.morphologyEx(result,cv2.MORPH_OPEN,kernel2,iterations=5)

#             #erode
#             #image = cv2.erode(result, kernel1)

#                 cv2.imwrite(path_image_cut+f'/{num-1}.jpg'.format(num),result)
#             num-=1
#             print(num)
#         #counter+=1

In [None]:
!zip -r data.zip /content/image_cut4

### easyocr model

In [None]:
! pip install -q easyocr

In [None]:
ls = os.listdir("/content/image_process_leawjingjing")
len(ls)

In [None]:
import easyocr
import pandas as pd
path = "/content/image_process_leawjingjing"
reader = easyocr.Reader(['th'])

words = []

# plt.figure(figsize=(20, 70))

# def plot_step(img,n,text='image') :

#     ax1 = plt.subplot(7,2,n)
#     img=cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#     ax1.imshow(img)
#     ax1.axis('off')
#     plt.title(text,fontsize=18)

for index in tqdm(range(1)):
#for index in tqdm(range(len(os.listdir(path)))):
  #img = cv2.imread(path+'/'+os.listdir(path)[index])
  #print(path+'/'+os.listdir(path)[index])
  result = reader.readtext(path+'/'+'0.jpg')
  #plot_step(img, index+1)
  print(result)

# NLP

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

## test Spelling function

In [None]:
!pip install pythainlp

In [None]:
!pip install fasttext

In [None]:
# test function
from pythainlp import spell

target = "มีคุณคา"
pred = spell(target)

print(pred[0])
print(pred)

In [None]:
from pythainlp.spell import correct_sent

correct_sent(["มีคุณคา","งขานจักดาน","จึ่ง"])

In [None]:
from pythainlp.spell import NorvigSpellChecker

checker = NorvigSpellChecker()

checker.correct("มีคุณคา")

In [None]:
# test function
from pythainlp.spell import correct

target = "มีคุณคา"
pred = correct(target)
#pred = correct(target, engine="wanchanberta_thai_grammarly")

print(pred)

## spelling tune model

In [None]:
import pythainlp
import numpy as np

words = pythainlp.corpus.ttc.word_freqs()
#words = np.array(list(words))  # to array

In [None]:
words
all_words = []
for i in words:
  all_words.append(i[0])

In [None]:
all_words = np.array(all_words)
all_words.shape

In [None]:
# thai_words was never imported in this notebook, so import it here.
from pythainlp.corpus import thai_words

# Kept under its own name so it does not overwrite the `df` DataFrame built in
# the OCR section. Only the size is shown instead of dumping every word.
thai_vocab = thai_words()
len(thai_vocab)

In [None]:
# `words` is the list of (word, frequency) entries and has no .shape;
# the NumPy array built from it is `all_words`.
all_words.shape

(62074,)

In [None]:
words_str = '\n'.join(all_words)
words_char = list(words_str)

In [None]:
with open('words-char.txt', mode='w', encoding='utf-8') as file:
    file.write(' '.join(words_char))

In [None]:
import fasttext

In [None]:
model = fasttext.train_unsupervised('words-char.txt',
                                    epoch=200,
                                    ws=3)

In [None]:
words_vec = [model.get_sentence_vector(' '.join(list(word))) for word in all_words]
words_vec = np.array(words_vec)

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

X, y = words_vec, words
nbrs = NearestNeighbors().fit(X, y)

In [None]:
import joblib

model.save_model('char2vec.bin')  # fasttext model
joblib.dump(words, 'words.joblib')
joblib.dump(nbrs, 'nbrs.joblib');

In [None]:
import fasttext
import joblib

model = fasttext.load_model('char2vec.bin')
words = joblib.load('words.joblib')
nbrs = joblib.load('nbrs.joblib')

# The suggestion lookup indexes `all_words`, which is not saved to disk.
# Rebuild it from the reloaded (word, frequency) entries so this cell is
# enough to restore the spelling model in a fresh kernel.
all_words = np.array([entry[0] for entry in words])

In [None]:
words_input = ['มีคุณคา', 'งขานจักดาน', 'หัตอกางน', 'ศิดปหัดดกรรม', 'ธรรมยาติ']

In [None]:
word_input_vec = [model.get_sentence_vector(' '.join(list(word))) for word in words_input]
indices = nbrs.kneighbors(word_input_vec, 5, False)  # n_neighbors is 5
suggestion = all_words[indices]

for w, s in zip(words_input, suggestion):
    print(f'{w} \n---> {s}')

## word beam search

In [None]:
! git clone https://github.com/githubharald/SimpleHTR

In [None]:
! git clone https://github.com/githubharald/CTCWordBeamSearch

In [None]:
! pip install CTCWordBeamSearch
%cd CTCWordBeamSearch
! pip install .

In [None]:
import numpy as np
from word_beam_search import WordBeamSearch

corpus = 'คามณี คุณงาม กามคุณ สมนาคุณ คุณค่า'  # whitespace-separated dictionary words
# The characters that can be recognized (in this order). Each character is
# listed once: the column index in `mat` identifies the character, so a
# duplicate entry would shift every later index.
chars = 'คุณา '
word_chars = 'คุณา'  # characters that form words

# RNN output
# 4 time-steps, batch size 1, and 6 scores per time-step:
# one per entry of `chars` (5) plus the CTC-blank as the last column.
mat = np.array([
    [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]],
    [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]],
    [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]],
    [[0.4, 0.6, 0.0, 0.0, 0.0, 0.0]]
])
assert mat.shape[2] == len(chars) + 1, "mat needs one column per char plus the CTC-blank"

# initialize word beam search (only do this once in your code)
wbs = WordBeamSearch(
    25,
    'Words', # "NGrams", "NGramsForecast", "NGramsForecastAndSample"
    0.0,
    corpus.encode('utf8'),
    chars.encode('utf8'),
    word_chars.encode('utf8')
)

# compute label string
label_str = wbs.compute(mat)

# Map each predicted label index back to its character.
char_str = [''.join(chars[label] for label in curr_label_str) for curr_label_str in label_str]


char_str

## spelling checker

In [None]:
import pandas as pd
data = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack6/csv-keeper/df_test_new-BW_1.csv")

In [None]:
data

In [None]:
# Imported here so the cell does not depend on the earlier test cell.
from pythainlp.spell import correct

corrected = []
for i, word in enumerate(data["Text"]):
  if(i%100==0):
    print(i)

  # correct() returns the corrected word as a string, so it is stored whole
  # (indexing it with [0] would keep only its first character).
  # Empty or non-string cells (e.g. NaN) are passed through unchanged.
  if isinstance(word, str) and word:
    corrected.append(correct(word))
  else:
    corrected.append(word)

data["CS text"] = corrected #CS text = correct spelling text

In [None]:
data

In [None]:
data.to_csv("/content/gdrive/MyDrive/SuperAI/hack6/csv-keeper/spelling_test_1.csv", index=False)

## NER

In [None]:
!pip install -q simpletransformers

In [None]:
import torch
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs

In [None]:
test_df = pd.read_csv("/content/cheat4.csv")

In [None]:
texts_test_raw = []
for i in range(len(test_df)):
  texts_test_raw.append(test_df.loc[i, "words"])

In [None]:
def blank_space(x):
  """Return '_' for an empty or missing token, otherwise return ``x`` unchanged.

  A token counts as missing when it equals '' or None, or when its string
  form is 'nan' (which covers float NaN as well as the literal text 'nan').
  """
  if not (x == '' or x == None or str(x)=='nan'):
    return x
  return '_'

#Loop replace blank to "_"
for i in range(len(texts_test_raw)):
  texts_test_raw[i] = blank_space(texts_test_raw[i])

In [None]:
def split_into_sentences(tokens, tokens_per_sentence=40):
    """Split ``tokens`` into consecutive chunks of ``tokens_per_sentence`` items.

    The last chunk holds the remainder and may be shorter; an empty input
    yields an empty list.
    """
    return [
        tokens[start:start + tokens_per_sentence]
        for start in range(0, len(tokens), tokens_per_sentence)
    ]

In [None]:
my_token = split_into_sentences(texts_test_raw)

In [None]:
sent_join = ' '.join(my_token[0])
type(sent_join)

In [None]:
my_token_list = []
for i in range(len(my_token)):
  sent_join = ' '.join(my_token[i])
  #print(sent_join)
  my_token_list.append(sent_join)

In [None]:
_NER_TAGS = [
        "O",
        "B_BRN",
        "B_DES",
        "B_DTM",
        "B_LOC",
        "B_MEA",
        "B_NUM",
        "B_ORG",
        "B_PER",
        "B_TRM",
        "B_TTL",
        "I_BRN",
        "I_DES",
        "I_DTM",
        "I_LOC",
        "I_MEA",
        "I_NUM",
        "I_ORG",
        "I_PER",
        "I_TRM",
        "I_TTL",
        "E_BRN",
        "E_DES",
        "E_DTM",
        "E_LOC",
        "E_MEA",
        "E_NUM",
        "E_ORG",
        "E_PER",
        "E_TRM",
        "E_TTL",
    ]

In [None]:
# Test Model
ner_args = NERArgs()
ner_args.eval_batch_size = 128
ner_args.use_multiprocessing = True
model = NERModel(
     "camembert", "/content/gdrive/MyDrive/SuperAI/model/best_model_wangchanberta_addarg", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS  # your latest model
)

In [None]:
predictions, raw_outputs = model.predict(my_token, False)

In [None]:
#Extract data value from dict list
# `predictions` holds one list per sentence, each containing one dict per
# token; collect every dict value into one flat list, in order. Written as a
# comprehension so no loop variable overwrites the `data` DataFrame.
final_test_df = [
    tag
    for sentence_pred in predictions
    for token_pred in sentence_pred
    for tag in token_pred.values()
]

In [None]:
len(final_test_df)

tag2idx

In [None]:
tag_list = pd.read_csv("/content/gdrive/MyDrive/SuperAI/tag_list.csv")

In [None]:
tag_list

In [None]:
# Lookup table: value of the "tag" column -> value of the "class" column,
# row by row (a repeated tag keeps the class of its last row).
tag2class = dict(zip(tag_list["tag"].to_numpy(), tag_list["class"].to_numpy()))

In [None]:
tag2class

In [None]:
final_result = pd.DataFrame(final_test_df, columns=["pred"])
final_result

In [None]:
for i in range(len(final_result)):
  final_result.loc[i,"pred2id"] = str(tag2class[final_result.loc[i, "pred"]])

In [None]:
# checking unique tag

x = final_result["pred"].unique()
print(len(x))
print(x)

19

['O' 'B_LOC' 'E_LOC' 'B_PER' 'B_TTL' 'E_PER' 'B_NUM' 'E_ORG' 'B_MEA'

 'B_DTM' 'I_PER' 'B_ORG' 'E_MEA' 'I_LOC' 'I_MEA' 'I_ORG' 'B_DES' 'E_DTM'

 'I_DTM']


In [None]:
# Rule-based corrections of the predicted class ids.
#
# The rules compare against and write class-id strings ("0", "3", "12", ...),
# and the submission is built from final_result["pred2id"], so they operate on
# that column. (Writing them to "pred", which holds tag names, had no effect on
# the submission.)
pred_col = "pred2id"

#animal
ani = "ราชสีห์,เสือ,เสือโคร่ง,ลา,แร้ง,หงส์,เสือดาว,หงส์,กา,นกหัวขวาน,จระเข้,นกยูง,สุนัข,สุนัขจิ้งจอก,หมี,แพะ,พญาลิง,ลิง,พญาหงส์ทอง,หงส์ทอง,แร้ง,ช้าง"
ani_ls = ani.split(',')

# Only rows present in both frames can be processed.
n_rows = min(len(test_df), len(final_result))

for i in range(n_rows):
    target = test_df.loc[i, "words"]
    # Missing tokens (NaN) are treated as empty text so the string tests work.
    if not isinstance(target, str):
        target = ""

    #animal
    if(target in ani_ls):
        final_result.loc[i, pred_col] = "0"
    # hunter ("นายพราน")
    if("นายพราน" in target):
        final_result.loc[i, pred_col] = "0"
    #prefix
    if(target in ["ท้าว", "พญา", "ท่าน", "พระ"]):
        final_result.loc[i, pred_col] = "9"
    #prefix sth (applied after the rule above, so "ท้าว" ends up as "11")
    if(target in ["ท้าว", "เจ้า"]):
        final_result.loc[i, pred_col] = "11"
    # a "12" on its own: mark the previous token as "3"
    # (i > 0 guard: .loc[-1] would silently append a new row)
    if(i > 0 and final_result.loc[i, pred_col] == "12"):
        final_result.loc[i-1, pred_col] = "3"
    # "_" and first-person pronouns
    if(target in ["_", "เรา", "ข้าพเจ้า"]):
        final_result.loc[i, pred_col] = "0"
    # "ที่" + order number
    # NOTE(review): the number is looked up two tokens ahead (i+2) -- confirm
    # this offset is intended rather than i+1.
    if (target == "ที่") and (i+2 < n_rows) and str(test_df.loc[i+2, "words"]).isdigit() :
        final_result.loc[i, pred_col] = "4"
    # pair of 4 and 17
    if (i+2 < n_rows) and (final_result.loc[i, pred_col]=="4") and (final_result.loc[i+2, pred_col]=="17"):
        final_result.loc[i+1, pred_col] = "15"
    # place
    if(target in ["วัด", "เมือง", "ป่า"]):
        final_result.loc[i, pred_col] = "3"
        # Bounds guard: writing past the last row would append a new row.
        if i+1 < n_rows:
            final_result.loc[i+1, pred_col] = "12"

In [None]:
submission = pd.read_csv("/content/gdrive/MyDrive/SuperAI/sample_submission.csv")

In [None]:
submission["pred"] = final_result["pred2id"]

In [None]:
submission

In [None]:
submission.to_csv("prediction-3k.csv", index=False)