In [17]:
import pandas as pd 
import cv2, numpy as np
import random
from scipy.stats import norm
from scipy.misc import imread, imsave
import re
import pickle
import idx2numpy as idx
from copy import copy
import os
from datetime import datetime 
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.image as mpimg
%matplotlib inline

from sklearn.metrics import precision_recall_fscore_support, f1_score, accuracy_score, precision_score, recall_score
from sklearn.manifold import TSNE

# keras
np.random.seed(13)
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Reshape, Activation, SimpleRNN, GRU, LSTM, Convolution1D, \
                         MaxPooling1D, Merge, Dropout, Input
from IPython.display import SVG
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
#from keras.utils.visualize_util import model_to_dot, plot
from keras.utils.vis_utils import model_to_dot, plot_model
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adam
from keras import backend as K
from keras.layers import Input, Flatten, Dense, Dropout, Activation, Lambda, Layer # keras.layers.core 
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.datasets import mnist
from keras.applications.vgg16 import VGG16
from keras import metrics
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

# logging
import logging
from importlib import reload
# Re-import the logging module before configuring it.
# NOTE(review): presumably this discards handlers already attached in the running
# kernel, since basicConfig() does nothing when the root logger already has
# handlers -- confirm this is still needed.
reload(logging)
# Emit records at INFO and above, formatted as "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
from copy import copy

In [8]:
# Dataset root directory plus the image and batch sizes used by the cells below.
rootpath = "../EgyptianHieroglyphDataset/MyTrainTest"
# without convolutional layers, use 75 * 50
original_dim = 75, 50
# this has to be the same as or multiplication of the param value in the analysis code
batch_size = 4 * 32

### training file list

In [9]:
# Build the symbol lookup table: one row per entry under train/, numbered by its
# position in the sorted list of entry names.
tr_rootpath = os.path.join(rootpath, "train")
# Skip hidden entries (leading ".") and any name containing "UNKNOWN".
symbol_names = sorted(
    x for x in os.listdir(tr_rootpath) if re.search(r"UNKNOWN|^\.", x) is None
)
# Construct both columns directly instead of renaming the output of an in-place
# reset_index(); this also yields an empty two-column frame (rather than a
# length-mismatch error) when no entries are found.
symbol_df = pd.DataFrame({"symbol_num": range(len(symbol_names)), "symbol": symbol_names})
symbol_df.tail(1)

Unnamed: 0,symbol_num,symbol
170,170,Z7


In [10]:
# Collect the path of every training image, laid out as train/<subdir>/<file>.
tr_rootpath = os.path.join(rootpath, "train")
# Skip hidden entries (leading ".") and any name containing "UNKNOWN".
subdirs = [x for x in os.listdir(tr_rootpath) if re.search(r"UNKNOWN|^\.", x) is None]
# Gather all paths first and build the frame once: concatenating a one-row frame
# per file copies the accumulated frame on every iteration (quadratic in the
# number of files). Row order is the same as the original nested loop.
tr_paths = []
for subdirpath in subdirs:
    subdir = os.path.join(tr_rootpath, subdirpath)
    for fpath in os.listdir(subdir):
        if re.search(r"UNKNOWN|^\.", fpath) is None:
            tr_paths.append(os.path.join(subdir, fpath))
# A fresh frame already carries a 0..n-1 index, so no reset_index() is needed.
tr_fnames = pd.DataFrame(tr_paths, columns = ["fname"])

In [11]:
# Show the last collected training path as a sanity check of the path layout.
tr_fnames['fname'].iloc[-1]

'../EgyptianHieroglyphDataset/MyTrainTest/train/G25/gen4_410090_G25.png'

In [12]:
# Derive each image's symbol label from its file name: the text captured between
# an underscore and ".png" (e.g. ".../gen4_410090_G25.png" -> "G25").
# The replacement template is a raw string so that "\g" is not parsed as an
# (invalid) string escape sequence; the value passed to re.sub is unchanged.
# Note that re.sub returns the input untouched when the pattern does not match.
tr_fnames['symbol'] = tr_fnames['fname'].map(lambda x: re.sub(r".*\_(?P<label>.*)\.png", r"\g<label>", x))

In [13]:
# Display the final row to check the extracted symbol column.
tr_fnames.iloc[-1:]

Unnamed: 0,fname,symbol
43985,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25


In [14]:
# Attach the numeric class id (symbol_num) from symbol_df to every training file.
# A symbol_num column left over from an earlier run of this cell is dropped
# first, so re-running does not produce symbol_num_x / symbol_num_y columns.
# The two printed shapes should show the same row count: the default inner join
# drops any file whose symbol is absent from symbol_df.
print(tr_fnames.shape)
tr_fnames = tr_fnames.drop("symbol_num", axis = 1, errors = "ignore").merge(symbol_df, on = "symbol")
print(tr_fnames.shape)

(43986, 2)
(43986, 3)


In [15]:
# Display the last six rows to inspect the merged symbol_num column.
tr_fnames.iloc[-6:]

Unnamed: 0,fname,symbol,symbol_num
43980,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25,52
43981,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25,52
43982,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25,52
43983,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25,52
43984,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25,52
43985,../EgyptianHieroglyphDataset/MyTrainTest/train...,G25,52


In [19]:
# Persist the training file list under the dataset root as tr_filelist.pkl.
tr_pickle_path = os.path.join(rootpath, "tr_filelist.pkl")
with open(tr_pickle_path, "wb") as out_file:
    pickle.dump(tr_fnames, out_file)

In [33]:
# Report (rows, columns) of the training file list.
print("({}, {})".format(*tr_fnames.shape))

(43986, 3)


### testing file list

In [24]:
# Collect the path of every test image; test files sit directly under test/.
ts_rootpath = os.path.join(rootpath, "test")
# Skip hidden entries (leading ".") and any name containing "UNKNOWN".
files = [x for x in os.listdir(ts_rootpath) if re.search(r"UNKNOWN|^\.", x) is None]
# Build the frame once from the full list of paths: concatenating a one-row
# frame per file copies the accumulated frame on every iteration. A fresh frame
# already carries a 0..n-1 index, so no reset_index() is needed.
ts_fnames = pd.DataFrame(
    [os.path.join(ts_rootpath, fpath) for fpath in files],
    columns = ["fname"],
)

In [25]:
# Show the last collected test path as a sanity check of the path layout.
ts_fnames['fname'].iloc[-1]

'../EgyptianHieroglyphDataset/MyTrainTest/test/070226_N35.png'

In [26]:
# Derive each test image's symbol label from its file name: the text captured
# between an underscore and ".png" (e.g. ".../070226_N35.png" -> "N35").
# The replacement template is a raw string so that "\g" is not parsed as an
# (invalid) string escape sequence; the value passed to re.sub is unchanged.
# Note that re.sub returns the input untouched when the pattern does not match.
ts_fnames['symbol'] = ts_fnames['fname'].map(lambda x: re.sub(r".*\_(?P<label>.*)\.png", r"\g<label>", x))

In [27]:
# Display the final row to check the extracted symbol column.
ts_fnames.iloc[-1:]

Unnamed: 0,fname,symbol
511,../EgyptianHieroglyphDataset/MyTrainTest/test/...,N35


In [28]:
# Attach the numeric class id (symbol_num) from symbol_df to every test file.
# A symbol_num column left over from an earlier run of this cell is dropped
# first, so re-running does not produce symbol_num_x / symbol_num_y columns.
# The left join keeps every test file; a symbol that is absent from symbol_df
# gets a missing symbol_num instead of the row being dropped.
print(ts_fnames.shape)
ts_fnames = ts_fnames.drop("symbol_num", axis = 1, errors = "ignore").merge(symbol_df, on = "symbol", how = "left")
print(ts_fnames.shape)

(512, 2)
(512, 3)


In [29]:
# Display the last six rows to inspect the merged symbol_num column.
ts_fnames.iloc[-6:]

Unnamed: 0,fname,symbol,symbol_num
506,../EgyptianHieroglyphDataset/MyTrainTest/test/...,I9,68
507,../EgyptianHieroglyphDataset/MyTrainTest/test/...,O31,109
508,../EgyptianHieroglyphDataset/MyTrainTest/test/...,D21,10
509,../EgyptianHieroglyphDataset/MyTrainTest/test/...,N35,100
510,../EgyptianHieroglyphDataset/MyTrainTest/test/...,I9,68
511,../EgyptianHieroglyphDataset/MyTrainTest/test/...,N35,100


In [30]:
# Persist the test file list under the dataset root as ts_filelist.pkl.
ts_pickle_path = os.path.join(rootpath, "ts_filelist.pkl")
with open(ts_pickle_path, "wb") as out_file:
    pickle.dump(ts_fnames, out_file)

In [31]:
# Report (rows, columns) of the test file list.
print("({}, {})".format(*ts_fnames.shape))

(512, 3)
