In [None]:
'''
original code source : https://www.kaggle.com/aidapearson/eda-starter-notebook
'''

In [None]:
%matplotlib inline
import glob
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from PIL import Image

In [None]:
# Notebook display configuration: set IPython's ast_node_interactivity option to "all".
# Per IPython's documented option values this displays the result of every
# expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def _iter_tab_records(file_path):
    """Yield ``(key, value)`` pairs from a text file holding one ``key<TAB>value`` record per line."""
    with open(file_path) as f:
        for line in f:
            line = line.replace("\n", "")
            if not line:
                # Tolerate blank lines (e.g. an extra newline at end of file)
                # instead of failing on the tuple unpack below.
                continue
            # Split on the first tab only so the value part is kept whole.
            key, value = line.split("\t", 1)
            yield key, value


def create_data_frame(default_image_path:str,data_path:str,level_path:str,source_path:str):
    """Build the per-image EDA table from the ground-truth, level and source files.

    All three files are read as tab-separated ``<image path><TAB><value>`` lines.
    Every image listed in ``data_path`` is opened to read its pixel size.

    Args:
        default_image_path: Prefix that is concatenated (plain string ``+``)
            with each image path from ``data_path`` before opening the image.
        data_path: File mapping image path to a space-separated LaTeX token string.
        level_path: File mapping image path to an integer level.
        source_path: File mapping image path to a source label (kept as ``str``).

    Returns:
        A tuple ``(df, all_latex_list, lv_latex_list)`` where

        * ``df`` has one row per record of ``data_path`` with the columns
          ``latex`` (token list), ``latex_str``, ``seq_len``, ``aspect_ratio``
          (width / height rounded to one decimal), ``image_width``,
          ``image_height``, ``level``, ``source`` and ``img_name``;
        * ``all_latex_list`` is the flat list of every token in file order;
        * ``lv_latex_list`` is a list of five flat token lists. Index 0-3 hold
          the tokens of levels 1-4; index 4 holds the tokens of every other
          level value (the original ``else`` branch), not only level 5.

    Raises:
        KeyError: If an image from ``data_path`` is missing in the level or
            source file.
        ValueError: If a non-blank line contains no tab or a level is not an
            integer.
    """
    level_info = {path: int(level) for path, level in _iter_tab_records(level_path)}
    source_info = {path: str(source) for path, source in _iter_tab_records(source_path)}

    # Key order defines the DataFrame column order; callers slice columns by position.
    data = {
        'latex': [],
        "latex_str": [],
        'seq_len': [],
        'aspect_ratio': [],
        'image_width': [],
        'image_height': [],
        'level': [],
        'source': [],
        "img_name": [],
    }
    all_latex_list = []
    lv_latex_list = [[] for _ in range(5)]

    for image_path, latex_str in _iter_tab_records(data_path):
        # Context manager closes the file handle; only the size is needed.
        with Image.open(default_image_path + image_path) as image:
            width, height = image.size
        tokens = latex_str.split(" ")
        level = level_info[image_path]
        source = source_info[image_path]

        data['aspect_ratio'].append(round(width / height, 1))
        data['image_width'].append(int(width))
        data['image_height'].append(int(height))
        data["latex_str"].append(latex_str)
        data['latex'].append(tokens)
        data['seq_len'].append(len(tokens))
        data['level'].append(level)
        data['source'].append(source)
        # Record the name here (not while reading the level file) so that it is
        # always aligned with the other columns of the same row.
        data["img_name"].append(image_path)

        all_latex_list += tokens
        bucket = level - 1 if level in (1, 2, 3, 4) else 4
        lv_latex_list[bucket] += tokens

    df = pd.DataFrame.from_dict(data)
    return df, all_latex_list, lv_latex_list

In [None]:
# Build the training table and the token lists from the train_dataset files.
df, all_latex_list, lv_latex_list = create_data_frame(
    default_image_path="/opt/ml/input/data/train_dataset/images/",
    data_path="/opt/ml/input/data/train_dataset/gt.txt",
    level_path="/opt/ml/input/data/train_dataset/level.txt",
    source_path="/opt/ml/input/data/train_dataset/source.txt",
)

In [None]:
# Show the column labels of the frame returned by create_data_frame.
df.columns

In [None]:
# Summary statistics of df (pandas' default describe() output).
df.describe()

In [None]:
def plot_dist(df, field, bins, color, xlabel, ylabel, title):
    """Draw a density histogram with a KDE curve for one column of ``df``.

    Args:
        df: DataFrame holding the data.
        field: Name of the column to plot.
        bins: Histogram bin specification, forwarded to seaborn.
        color: Colour of the histogram and curve.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Figure title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    sns.set(color_codes=True)
    fig, ax = plt.subplots(figsize=(18,6))
    if hasattr(sns, "histplot"):
        # distplot is deprecated; histplot with kde=True, stat="density" and
        # cut=3 is the replacement suggested by seaborn for the same picture.
        sns.histplot(df[field], bins=bins, color=color, kde=True,
                     stat="density", kde_kws=dict(cut=3), ax=ax)
    else:
        # Older seaborn releases without histplot: keep the original call.
        sns.distplot(df[field], bins=bins, color=color, ax=ax)
    ax.set_xlabel(xlabel, fontsize=13)
    ax.set_ylabel(ylabel, fontsize=13)
    ax.set_title(title, fontsize=20)
    plt.show()

In [None]:
# Distribution of the number of LaTeX tokens per image over the whole frame.
plot_dist(
    df=df,
    field='seq_len',
    bins=50,
    color='b',
    xlabel='Sequence Length',
    ylabel='Frequency',
    title='Sequence Length Distribution (Train 100k Images)',
)

In [None]:
# Distribution of width / height over the whole frame.
plot_dist(
    df=df,
    field='aspect_ratio',
    bins=10,
    color='r',
    xlabel='Aspect Ratio (Image Width / Image Height)',
    ylabel='Frequency',
    title='Aspect Ratio Distribution (Train 100k Images)',
)

In [None]:
# Joint KDE of image width against image height for the whole frame.
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
g = sns.jointplot(x="image_width", y="image_height", data=df, kind="kde", space=0, color="r")
g.set_axis_labels("Image Width", "Image Height")

In [None]:
def create_count_df(df, field, index,top_token):
    """Count rows per distinct value of ``field`` and keep the most frequent ones.

    Args:
        df: Input DataFrame.
        field: Column to group by.
        index: Column whose non-null entries are counted inside each group.
        top_token: Number of leading rows (highest counts first) to return.

    Returns:
        DataFrame with the columns ``field`` and ``field + '_count'``, sorted
        by count in descending order and cut to the first ``top_token`` rows.
    """
    per_value = df.groupby(field)[index].count()
    ranked = per_value.sort_values(ascending=False).to_frame().reset_index()
    ranked.columns = [field, field + '_count']
    return ranked.iloc[:top_token]

def plot_count_df(df, field, random_sample, color, rotation, xlabel, ylabel, title):
    """Draw a bar chart of ``df[field + '_count']`` against ``df[field]``.

    Args:
        df: DataFrame with the columns ``field`` and ``field + '_count'``.
        field: Name of the category column used for the x axis.
        random_sample: If truthy, plot a reproducible random sample of at most
            50 rows instead of the whole frame.
        color: Bar colour.
        rotation: Rotation of the x tick labels in degrees.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Figure title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(18,6))
    if random_sample:
        # Cap the sample size: DataFrame.sample raises when asked for more
        # rows than the frame holds.
        df = df.sample(n=min(50, len(df)), random_state=1)
    ax.bar(df[field], df[field + '_count'], color=color, align='center',alpha=0.5)
    # Style the tick labels the axis already generates instead of overwriting
    # them with set_xticklabels, which pairs labels with whatever ticks happen
    # to exist and can mislabel bars.
    ax.tick_params(axis='x', labelrotation=rotation, labelsize=13)
    ax.set_xlabel(xlabel, fontsize=13)
    ax.set_ylabel(ylabel, fontsize=13)
    ax.set_title(title, fontsize=20)
    plt.show()

# Directory prefix used by origin_img / compare_img_gt to open image files.
# It has to be the directory the frame was built from (the images path passed
# to create_data_frame), otherwise df's img_name values do not resolve.
default_image_path="/opt/ml/input/data/train_dataset/images/"

def latex_to_img(img_num:int):
    """Render the ground-truth LaTeX of one row of the global ``df`` as text.

    Args:
        img_num: Positional row number in ``df``.

    Returns:
        None. The rendering is shown with ``plt.show()``.
    """
    latex_source = df.iloc[img_num]['latex_str']
    # Axes rectangle is [left, bottom, width, height]; ticks and frame are
    # hidden so only the text remains visible.
    canvas = plt.axes([0, 0, 0.2, 0.2])
    canvas.set_xticks([])
    canvas.set_yticks([])
    canvas.axis('off')
    # The string is wrapped in $...$ before being handed to matplotlib.
    plt.text(0.3, 0.5, '$%s$' % latex_source, size=35, color="green")
    plt.show()

def origin_img(img_num:int):
    """Show the image file that belongs to one row of the global ``df``.

    Args:
        img_num: Positional row number in ``df``.

    Returns:
        None. The image is shown with ``plt.show()``.
    """
    img_name = df.iloc[img_num]['img_name']
    # Read through pyplot: the notebook never imports an ``mpimg`` alias, so
    # the previous mpimg.imread call raised NameError on a fresh kernel.
    img = plt.imread(default_image_path+img_name)
    plt.grid(True, color='w')
    plt.imshow(img)
    plt.show()

def compare_img_gt(img_num:int, visible_infos=False):
    """Show an image and, below it, its ground-truth LaTeX rendered as text.

    Args:
        img_num: Positional row number in the global ``df``.
        visible_infos: If truthy, also display the row's fields except the
            first one.

    Returns:
        None. Both figures are shown with ``plt.show()``.
    """
    # original img
    img_name = df.iloc[img_num]['img_name']
    # Read through pyplot: the notebook never imports an ``mpimg`` alias, so
    # the previous mpimg.imread call raised NameError on a fresh kernel.
    img = plt.imread(default_image_path+img_name)
    plt.grid(True, color='w')
    plt.imshow(img)
    plt.show()

    box1 = {'boxstyle': 'round',
        'ec': (0.5, 0.3, 0.3), # edgecolor
        'fc': (0.5, 0.8, 0.8)} # facecolor

    # ground truth latex to img
    a = df.iloc[img_num]['latex_str']
    ax = plt.axes([0,0,0.2,0.2]) #left,bottom,width,height
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.text(0.3,0.5,'$%s$' %a,size=20, bbox=box1)
    plt.show()

    if visible_infos:
        # Positional slice made explicit with .iloc: skip the first field of the row.
        display(df.iloc[img_num].iloc[1:])

In [None]:
# One row per token occurrence; the 'index' column is a copy of the row index
# so that create_count_df has a column to count.
token_df = pd.DataFrame({'token': all_latex_list})
token_df = token_df.assign(index=token_df.index)

In [None]:
# Twenty most frequent tokens over all images, drawn as a bar chart.
token_count_df = create_count_df(df=token_df, field='token', index='index', top_token=20)
plot_count_df(
    df=token_count_df,
    field='token',
    random_sample=False,
    color='g',
    rotation=90,
    xlabel='Token',
    ylabel='Number of Tokens',
    title='Token Distribution (10k Images)',
)

In [None]:
# Split the frame into one sub-frame per level (1 to 5); level_dfs[i] holds level i + 1.
level_dfs = [df[df['level'] == level_value] for level_value in range(1, 6)]
level_1, level_2, level_3, level_4, level_5 = level_dfs

In [None]:
# Draw the same four plots for every level.
# The loop variable is named level_df on purpose: iterating with the name `df`
# rebinds the global frame that latex_to_img / origin_img / compare_img_gt read,
# leaving them with only the last level's rows after the loop.
for level,level_df in enumerate(level_dfs):
    plot_dist(df=level_df, field='seq_len', bins=50, color='b', xlabel='Sequence Length',
              ylabel='Frequency', title='Sequence Length Distribution (level {})'.format(level+1))
    plot_dist(df=level_df, field='aspect_ratio', bins=10, color='r', xlabel='Aspect Ratio (Image Width / Image Height)',
              ylabel='Frequency', title='Aspect Ratio Distribution (level {})'.format(level+1))
    # x / y by keyword: newer seaborn releases reject them as positional arguments.
    g = sns.jointplot(x="image_width", y="image_height", data=level_df, kind="kde", space=0, color="r")
    g.set_axis_labels("Image Width", "Image Height")
    token_df = pd.DataFrame(lv_latex_list[level], columns =['token'])
    token_df['index']=token_df.index
    token_count_df = create_count_df(df=token_df, field='token', index='index',top_token=20)
    plot_count_df(df=token_count_df, field='token', random_sample=False, color='g', rotation=90, xlabel='Token',
                  ylabel='Number of Tokens', title='Token Distribution (level {})'.format(level+1))
    

In [None]:
# Position in level_dfs / lv_latex_list to inspect (0 selects level 1).
level = 0
level_dfs[level].describe()

In [None]:
# Plots for the sub-frame selected by `level` (set in the preceding cell).
plot_dist(df=level_dfs[level], field='seq_len', bins=50, color='b', xlabel='Sequence Length',
          ylabel='Frequency', title='Sequence Length Distribution (level {})'.format(level+1))
plot_dist(df=level_dfs[level], field='aspect_ratio', bins=10, color='r', xlabel='Aspect Ratio (Image Width / Image Height)',
          ylabel='Frequency', title='Aspect Ratio Distribution (level {})'.format(level+1))
# x / y by keyword: newer seaborn releases reject them as positional arguments.
g = sns.jointplot(x="image_width", y="image_height", data=level_dfs[level], kind="kde", space=0, color="r")
g.set_axis_labels("Image Width", "Image Height")
token_df = pd.DataFrame(lv_latex_list[level], columns =['token'])
token_df['index']=token_df.index
token_count_df = create_count_df(df=token_df, field='token', index='index',top_token=20)
plot_count_df(df=token_count_df, field='token', random_sample=False, color='g', rotation=90, xlabel='Token',
              ylabel='Number of Tokens', title='Token Distribution (level {})'.format(level+1))

In [None]:
# Position in level_dfs / lv_latex_list to inspect (1 selects level 2).
level = 1
level_dfs[level].describe()

In [None]:
# Plots for the sub-frame selected by `level` (set in the preceding cell).
plot_dist(df=level_dfs[level], field='seq_len', bins=50, color='b', xlabel='Sequence Length',
          ylabel='Frequency', title='Sequence Length Distribution (level {})'.format(level+1))
plot_dist(df=level_dfs[level], field='aspect_ratio', bins=10, color='r', xlabel='Aspect Ratio (Image Width / Image Height)',
          ylabel='Frequency', title='Aspect Ratio Distribution (level {})'.format(level+1))
# x / y by keyword: newer seaborn releases reject them as positional arguments.
g = sns.jointplot(x="image_width", y="image_height", data=level_dfs[level], kind="kde", space=0, color="r")
g.set_axis_labels("Image Width", "Image Height")
token_df = pd.DataFrame(lv_latex_list[level], columns =['token'])
token_df['index']=token_df.index
token_count_df = create_count_df(df=token_df, field='token', index='index',top_token=20)
plot_count_df(df=token_count_df, field='token', random_sample=False, color='g', rotation=90, xlabel='Token',
              ylabel='Number of Tokens', title='Token Distribution (level {})'.format(level+1))

In [None]:
# Position in level_dfs / lv_latex_list to inspect (2 selects level 3).
level = 2
level_dfs[level].describe()

In [None]:
# Plots for the sub-frame selected by `level` (set in the preceding cell).
plot_dist(df=level_dfs[level], field='seq_len', bins=50, color='b', xlabel='Sequence Length',
          ylabel='Frequency', title='Sequence Length Distribution (level {})'.format(level+1))
plot_dist(df=level_dfs[level], field='aspect_ratio', bins=10, color='r', xlabel='Aspect Ratio (Image Width / Image Height)',
          ylabel='Frequency', title='Aspect Ratio Distribution (level {})'.format(level+1))
# x / y by keyword: newer seaborn releases reject them as positional arguments.
g = sns.jointplot(x="image_width", y="image_height", data=level_dfs[level], kind="kde", space=0, color="r")
g.set_axis_labels("Image Width", "Image Height")
token_df = pd.DataFrame(lv_latex_list[level], columns =['token'])
token_df['index']=token_df.index
token_count_df = create_count_df(df=token_df, field='token', index='index',top_token=20)
plot_count_df(df=token_count_df, field='token', random_sample=False, color='g', rotation=90, xlabel='Token',
              ylabel='Number of Tokens', title='Token Distribution (level {})'.format(level+1))

In [None]:
# Position in level_dfs / lv_latex_list to inspect (3 selects level 4).
level = 3
level_dfs[level].describe()

In [None]:
# Plots for the sub-frame selected by `level` (set in the preceding cell).
plot_dist(df=level_dfs[level], field='seq_len', bins=50, color='b', xlabel='Sequence Length',
          ylabel='Frequency', title='Sequence Length Distribution (level {})'.format(level+1))
plot_dist(df=level_dfs[level], field='aspect_ratio', bins=10, color='r', xlabel='Aspect Ratio (Image Width / Image Height)',
          ylabel='Frequency', title='Aspect Ratio Distribution (level {})'.format(level+1))
# x / y by keyword: newer seaborn releases reject them as positional arguments.
g = sns.jointplot(x="image_width", y="image_height", data=level_dfs[level], kind="kde", space=0, color="r")
g.set_axis_labels("Image Width", "Image Height")
token_df = pd.DataFrame(lv_latex_list[level], columns =['token'])
token_df['index']=token_df.index
token_count_df = create_count_df(df=token_df, field='token', index='index',top_token=20)
plot_count_df(df=token_count_df, field='token', random_sample=False, color='g', rotation=90, xlabel='Token',
              ylabel='Number of Tokens', title='Token Distribution (level {})'.format(level+1))

In [None]:
# Position in level_dfs / lv_latex_list to inspect (4 selects level 5).
level = 4
level_dfs[level].describe()

In [None]:
# Plots for the sub-frame selected by `level` (set in the preceding cell).
plot_dist(df=level_dfs[level], field='seq_len', bins=50, color='b', xlabel='Sequence Length',
          ylabel='Frequency', title='Sequence Length Distribution (level {})'.format(level+1))
plot_dist(df=level_dfs[level], field='aspect_ratio', bins=10, color='r', xlabel='Aspect Ratio (Image Width / Image Height)',
          ylabel='Frequency', title='Aspect Ratio Distribution (level {})'.format(level+1))
# x / y by keyword: newer seaborn releases reject them as positional arguments.
g = sns.jointplot(x="image_width", y="image_height", data=level_dfs[level], kind="kde", space=0, color="r")
g.set_axis_labels("Image Width", "Image Height")
token_df = pd.DataFrame(lv_latex_list[level], columns =['token'])
token_df['index']=token_df.index
token_count_df = create_count_df(df=token_df, field='token', index='index',top_token=20)
plot_count_df(df=token_count_df, field='token', random_sample=False, color='g', rotation=90, xlabel='Token',
              ylabel='Number of Tokens', title='Token Distribution (level {})'.format(level+1))

In [None]:
# Row of df to visualise with each of the three helpers in turn.
img_nb = 0

print("---------------- print latex to img ----------------")
latex_to_img(img_num=img_nb)

print("---------------- print original img ----------------")
origin_img(img_num=img_nb)

print("---------------- print all ----------------")
compare_img_gt(img_num=img_nb, visible_infos=True)