In [75]:
import re
import warnings; warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract

In [452]:
# Convert image to black and white (disabled preprocessing experiment: pixels brighter than `threshold` are set to white before OCR)

# img = Image.open('test2.png')
# img = Image.open("test1cut.jpg")
# img = img.convert('P') 
# img = img.convert('L') 
# img = img.convert('RGBA') 
# threshold = 220
# array = np.array(img)
# array = np.where(array > threshold, 255, array) # 0: black, 255: white
# img = Image.fromarray(array.astype('uint8')) 
# img.save('out.png')
# text = pytesseract.image_to_string(img, lang='chi_tra')
# print(text)

In [123]:
class TransformImg:
    """Extract short Chinese text lines from an image with Tesseract OCR.

    The pipeline (see ``run``) OCRs the image into a table with one row per
    detected element, keeps the confidently recognised words, joins the words
    of every line into a ``full_text`` column, and finally keeps one row per
    line that passes the length, position and "contains a Chinese character"
    filters.

    Parameters
    ----------
    img
        Image handed unchanged to ``pytesseract.image_to_data`` (the notebook
        passes a ``PIL.Image``).
    config : str
        Extra Tesseract command-line options.
    lang : str
        Tesseract language pack to use.
    min_len, max_len : int
        Exclusive lower / upper bounds on the character count of a line's
        ``full_text``.
    min_left, min_top : int
        Exclusive lower bounds on the ``left`` / ``top`` coordinate of
        the first word of a line; rows at or below these values are dropped.

    NOTE(review): lines are identified by ``(block_num, line_num)`` only.
    If Tesseract restarts ``line_num`` for every paragraph of a block, lines of
    different paragraphs would be merged -- confirm against real output.
    """

    # Value of the ``level`` column that marks word rows in Tesseract's output.
    WORD_LEVEL = 5
    # Character class covering U+4E00..U+9FFF (CJK unified ideographs).
    ZH_PATTERN = '[\u4e00-\u9fff]'

    def __init__(self, img, config=r'--psm 12 --dpi 300', lang='chi_tra',
                 min_len=3, max_len=10, min_left=190, min_top=200):
        self.img = img
        # Original author's note: page segmentation modes 6, 11 and 12 are more accurate.
        self.config = config
        self.output_type = 'dict'
        self.lang = lang
        self.min_len = min_len
        self.max_len = max_len
        self.min_left = min_left
        self.min_top = min_top

    def img_to_dataframe(self):
        """Run OCR on ``self.img`` and return the raw result table as a DataFrame."""
        data = pytesseract.image_to_data(
            self.img,
            output_type=self.output_type,
            lang=self.lang,
            config=self.config,
        )
        return pd.DataFrame(data)

    def pre_filter(self, df):
        """Keep word-level rows with a positive confidence.

        Returns a new frame without the ``page_num``, ``par_num`` and
        ``word_num`` columns; ``df`` itself is not modified.
        """
        # Depending on the pytesseract / Tesseract versions, ``conf`` may come
        # back as strings such as '96.12' or '-1'; comparing those with 0
        # directly raises TypeError, so the mask is built on a numeric view.
        conf = pd.to_numeric(df['conf'], errors='coerce')
        mask = (df['level'] == self.WORD_LEVEL) & (conf > 0)
        return df[mask].drop(['page_num', 'par_num', 'word_num'], axis=1)

    def create_full_text(self, df_text):
        """Return a copy of ``df_text`` with a ``full_text`` column added.

        ``full_text`` is the concatenation (no separator) of the ``text`` of
        all rows sharing the same ``block_num`` and ``line_num``; every row of
        a line carries the same value. The input frame is left untouched.
        """
        out = df_text.copy()
        # Cast defensively so a non-string cell cannot break ''.join.
        words = out['text'].astype(str)
        if out.empty:
            # A group transform on an empty frame does not yield a string
            # column, which would make the later ``.str`` filters fail.
            out['full_text'] = words
            return out
        out['full_text'] = words.groupby(
            [out['block_num'], out['line_num']]
        ).transform(lambda parts: ''.join(parts))
        return out

    def post_filter(self, df_text):
        """Collapse to one row per line and apply the content/position filters.

        The first row of every ``(block_num, line_num, full_text)`` group is
        kept, so ``text``/``left``/``top`` describe the first word of the line.
        A line survives when its ``full_text`` length lies strictly between
        ``min_len`` and ``max_len``, its first word is not '一', it lies beyond
        ``min_left``/``min_top``, and it contains at least one character
        matching ``ZH_PATTERN``.
        """
        lines = (
            df_text
            .drop_duplicates(['block_num', 'line_num', 'full_text'])
            .drop(['level', 'block_num', 'line_num'], axis=1)
        )
        full_text = lines['full_text']
        length = full_text.str.len()
        has_zh = full_text.str.contains(self.ZH_PATTERN, na=False).astype(bool)
        mask = (
            (length > self.min_len)
            & (length < self.max_len)
            & (lines['text'] != '一')
            & (lines['left'] > self.min_left)
            & (lines['top'] > self.min_top)
            & has_zh
        )
        return lines[mask]

    def run(self):
        """Execute the whole pipeline on ``self.img`` and return the filtered frame."""
        df = self.img_to_dataframe()
        df_text = self.pre_filter(df)
        df_text = self.create_full_text(df_text)
        df_text = self.post_filter(df_text)
        return df_text

In [127]:
# Run the OCR pipeline on ../img/test3.png; the trailing bare expression
# displays the filtered frame as the cell output.
img = Image.open("../img/test3.png")
transform_img = TransformImg(img)
df_text = transform_img.run()
df_text

Unnamed: 0,left,top,width,height,conf,text,full_text
20,243,264,65,21,83,Lv.9,Lv.9幼基拉斯
75,315,975,100,35,96,活力,活力填充S
106,204,1289,86,29,96,技能,技能機率提升S
114,730,1289,35,29,92,持,持有上限提升S
137,747,1466,91,29,93,幫忙,幫忙速度S
143,204,1465,86,30,96,技能,技能等級提升S
162,200,1642,29,30,96,技能,技能機率提升M
182,587,2019,101,34,96,食材,食材發現率條全
198,586,2090,97,33,96,活力,活力回復


In [125]:
# Run the OCR pipeline on ../img/test1.png. Note that img, transform_img and
# df_text are rebound here, replacing whatever a previously run cell stored.
img = Image.open("../img/test1.png")
transform_img = TransformImg(img)
df_text = transform_img.run()
df_text

Unnamed: 0,left,top,width,height,conf,text,full_text
26,243,264,66,21,88,Lv.6,Lv.6樹才怪
62,314,939,64,34,90,能,能量填充M
95,219,1252,35,29,92,持,持有上限提升S
104,715,1252,85,29,96,食材,食材機率提升S
125,204,1429,72,29,96,技能,技能機率提升$
133,747,1429,89,29,86,幫忙,幫忙速度S
152,216,1606,73,30,94,睡眠,睡眠EXP獎勵
165,589,1987,63,26,93,EXP,EXP獲得量條全
180,586,2053,63,33,96,主,主技能發動機率


In [126]:
# Run the OCR pipeline on ../img/test2.png and display the filtered frame.
# img, transform_img and df_text are rebound by this cell.
img = Image.open("../img/test2.png")
transform_img = TransformImg(img)
df_text = transform_img.run()
df_text

Unnamed: 0,left,top,width,height,conf,text,full_text
36,243,264,75,21,89,Lv.13,Lv.13皮卡
71,314,974,65,35,92,能,能量填充S
101,203,1287,88,30,96,技能,技能機率提升S
109,724,1288,37,29,49,燒,燒有上限提升M
128,219,1465,35,29,92,持,持有上限提升S
137,715,1464,86,30,96,食材,食材機率提升$
152,237,1642,91,29,94,幫忙,幫忙速度S
163,587,2018,93,34,95,活力,活力回復量人條
177,586,2089,63,33,96,主,主技能發動機率
