In [None]:
import numpy as np
from Utils import *
import pandas as pd
from ast import literal_eval

Save big segment df as one csv
- segment index
- tablet cdli number
- assignment of view to the segments (e.g. obverse, reverse)
- collection name
- bbox annotation of the tablet segments in the composite image
- estimated scale factor that aims for an average sign height of about 128 image pixels
- assigned flag (presumably whether the segment contains annotated signs; to be confirmed)

In [None]:
# Load the segment tables through Utils.read_in_csv and store the resulting
# frame as one CSV file.
# NOTE(review): the output file is written inside top_dir_seg itself. If
# read_in_csv picks up every CSV below that folder, a re-run may ingest the
# previously written output as well — confirm against Utils.read_in_csv.
top_dir_seg = "../data/segments/"
save_as_seg = "../data/segments/all_tablet_segments.csv"

big_seg_df = read_in_csv(
    top_dir=top_dir_seg,
)
big_seg_df.to_csv(
    save_as_seg,
    index=False,  # do not write the frame's index as an extra column
)

Save big bbox annotation df as one csv
- segment index
- tablet cdli number
- assignment of view to the segments (e.g. obverse, reverse)
- collection name
- sign label according to MZL
- sign label for machine learning according to the mapping in label.json (note: this file is not available here)
- bbox annotation of the tablet segments in the composite image
- bounding box coordinates relative to segment bbox and full composite image

In [None]:
# Load the bounding-box annotation tables through Utils.read_in_csv, restricted
# by the search term "bbox", and store the resulting frame as one CSV file.
# NOTE(review): the output file name contains "bbox" and is written inside
# top_dir_ann. If read_in_csv matches files by name below that folder, a re-run
# may ingest the previously written output too — confirm against Utils.read_in_csv.
search_term_bbox = "bbox"
top_dir_ann = "../data/annotations/"
save_as_ann = "../data/annotations/all_bbox_annotations.csv"

big_bbox_df = read_in_csv(
    top_dir=top_dir_ann,
    search_term=search_term_bbox,
)
big_bbox_df.to_csv(
    save_as_ann,
    index=False,  # do not write the frame's index as an extra column
)

Remove duplicate entries in all_bbox_annotations

In [None]:
# Rewrite the combined bbox annotation CSV in place, keeping only the first
# occurrence of each fully identical row.
all_ann_path = "../data/annotations/all_bbox_annotations.csv"

all_ann_df = pd.read_csv(all_ann_path).drop_duplicates()
all_ann_df.to_csv(
    all_ann_path,
    index=False,  # do not write the frame's index as an extra column
)

Save big line annotation df as one csv
- line index identifies the points belonging to a line (lines can consist of one or several connected segments)
- lines that are defined by the start and end points of segments with (x,y) coords
- segment index
- tablet cdli number

In [None]:
# Load the line annotation tables through Utils.read_in_csv, restricted by the
# search term "line", and store the resulting frame as one CSV file.
# NOTE(review): the output file name contains "line" and is written inside
# top_dir_line. If read_in_csv matches files by name below that folder, a re-run
# may ingest the previously written output too — confirm against Utils.read_in_csv.
search_term_line = "line"
top_dir_line = "../data/annotations/"
save_as_line = "../data/annotations/all_line_annotations.csv"

big_line_df = read_in_csv(
    top_dir=top_dir_line,
    search_term=search_term_line,
)
big_line_df.to_csv(
    save_as_line,
    index=False,  # do not write the frame's index as an extra column
)

Save big transliteration df as one csv
- segment index
- tablet cdli number
- assignment of view to the segments (e.g. obverse, reverse)
- line by line cuneified transliteration (additional information available in a translation was removed)
- entries are indexed by line and position in line
- sign label for machine learning according to mapping in label.json
- sign label according to MZL
- status (?)

In [None]:
# Load the transliteration tables through Utils.read_in_csv and store the
# resulting frame as one CSV file.
# NOTE(review): the output file is written inside top_dir_trans itself. If
# read_in_csv picks up every CSV below that folder, a re-run may ingest the
# previously written output as well — confirm against Utils.read_in_csv.
top_dir_trans = "../data/transliterations/"
save_as_trans = "../data/transliterations/all_transliterations.csv"

big_trans_df = read_in_csv(
    top_dir=top_dir_trans,
)
big_trans_df.to_csv(
    save_as_trans,
    index=False,  # do not write the frame's index as an extra column
)

Check quantities

In [None]:
# Report how many image files exist and how many distinct tablets occur in the
# combined bbox and line annotation tables.
top_img_dir = "../data/images/all_images"
bbox_df_path = "../data/annotations/all_bbox_annotations.csv"
line_df_path = "../data/annotations/all_line_annotations.csv"

img_name_list = get_filepaths(dir_name=top_img_dir)
bbox_df = pd.read_csv(bbox_df_path)
line_df = pd.read_csv(line_df_path)

# Compute the three counts first so the print statements stay short.
n_images = len(img_name_list)
n_bbox_tablets = bbox_df['tablet_CDLI'].nunique()
n_line_tablets = line_df['tablet_CDLI'].nunique()

# The trailing numbers are the values noted in the original notebook.
print(f"The number of images is: {n_images}")  # 2,190
print(f"The number of unique tablets in bbox is: {n_bbox_tablets}")  # 81
print(f"The number of unique tablets in line is: {n_line_tablets}")  # 36

Check which tablets in bbox are not in segments

In [None]:
# Find bbox annotation rows whose tablet id does not occur in the combined
# segment table, and report them so the check has a visible result.
all_segs_path = "../data/segments/all_tablet_segments.csv"
bbox_df_path = "../data/annotations/all_bbox_annotations.csv"

bbox_df = pd.read_csv(bbox_df_path)
all_segs_df = pd.read_csv(all_segs_path)

# True for every bbox row whose tablet_CDLI value is absent from the segment table.
bool_mask = ~bbox_df['tablet_CDLI'].isin(all_segs_df['tablet_CDLI'])
extra_tablets = bbox_df[bool_mask]

# Previously the result was computed but never shown; print a short summary:
# the number of affected annotation rows and the distinct tablet ids involved.
extra_tablet_ids = extra_tablets['tablet_CDLI'].unique().tolist()
print(f"Annotation rows without a matching segment entry: {len(extra_tablets)}")
print(f"Tablets in bbox but not in segments ({len(extra_tablet_ids)}): {extra_tablet_ids}")

Remove Unassigned entries from all tablet segments

In [None]:
# Rewrite the combined segment CSV in place, keeping only the rows selected by
# the `assigned` column.
# NOTE(review): this relies on `assigned` being read back as a boolean column
# (no missing values) — confirm against the data.
all_segs_path = "../data/segments/all_tablet_segments.csv"

segs_df = pd.read_csv(all_segs_path)
assigned_mask = segs_df["assigned"]
segs_df = segs_df[assigned_mask]

segs_df.to_csv(
    all_segs_path,
    index=False,  # do not write the frame's index as an extra column
)


Remove annotations with no bounding box (segm_idx is -1)

In [None]:
# Rewrite the combined bbox annotation CSV in place, dropping every row whose
# segment index equals -1.
all_annotations_path = "../data/annotations/all_bbox_annotations.csv"

annotations_df = pd.read_csv(all_annotations_path)
has_segment = annotations_df["segm_idx"] != -1
annotations_df = annotations_df[has_segment]

annotations_df.to_csv(
    all_annotations_path,
    index=False,  # do not write the frame's index as an extra column
)

Copy the images of all tablets listed in the bbox CSV into a separate folder

In [None]:
# Hand the distinct tablet ids found in the combined bbox annotation table to
# Utils.copy_images, together with the source and destination image folders.
image_dir = "../data/images/all_images"
save_dir = "../data/images/viable_images"
bbox_df_path = "../data/annotations/all_bbox_annotations.csv"

bbox_df = pd.read_csv(bbox_df_path)

# Distinct tablet ids in order of first appearance, as a plain Python list.
tablet_ids = bbox_df['tablet_CDLI'].unique()
unique_img_names = tablet_ids.tolist()

copy_images(
    top_dir=image_dir,
    og_img_name_list=unique_img_names,
    save_dir=save_dir,
)


Save Segments and bbox annotations

In [None]:
# Prepare the segment and bbox annotation tables and pass them, with the image
# folder and the two output folders, to Utils.save_segments_and_bboxes.

# --- Segment table -----------------------------------------------------------
seg_df_path = "../data/segments/all_tablet_segments.csv"
seg_df = pd.read_csv(seg_df_path)
# The bbox column is stored as text in the CSV: parse each cell with
# literal_eval, then wrap the parsed value in a numpy array.
seg_bbox_parsed = seg_df["bbox"].apply(literal_eval)
seg_df["bbox"] = seg_bbox_parsed.apply(np.array)

# --- Bbox annotation table ---------------------------------------------------
bbox_df_path = "../data/annotations/all_bbox_annotations.csv"
bbox_df = pd.read_csv(bbox_df_path)
# Same text -> parsed value -> numpy array conversion for relative_bbox.
relative_bbox_parsed = bbox_df["relative_bbox"].apply(literal_eval)
bbox_df["relative_bbox"] = relative_bbox_parsed.apply(np.array)

# --- Full images -------------------------------------------------------------
img_dir = "../data/images/viable_images"
# NOTE(review): `filepaths` is not used further in this cell; kept in case
# later cells read it.
filepaths = get_filepaths(dir_name=img_dir)

# --- Output folders (segment images first, then per-segment annotations) ------
seg_save_path = "../data/images/segments"
annot_save_path = "../data/annotations/segment_bbox_annotations"
save_dirs = [seg_save_path, annot_save_path]

# --- Run ---------------------------------------------------------------------
save_segments_and_bboxes(
    img_top_dir=img_dir,
    og_seg_df=seg_df,
    og_bbox_df=bbox_df,
    save_dirs=save_dirs,
)


Remove tablets from all_bbox_annotations that cannot be used

In [None]:
# Rewrite the combined bbox annotation CSV in place, keeping only rows whose
# "<tablet_CDLI>_<segm_idx>.jpg" name appears in the list returned by
# Utils.get_filepaths for the segment image folder.
# NOTE(review): this assumes get_filepaths yields entries in exactly that
# "<tablet>_<index>.jpg" form (no directory prefix) — confirm against Utils.
seg_dir_path = "../data/images/segments"
all_bbox_path = "../data/annotations/all_bbox_annotations.csv"

seg_img_names = get_filepaths(dir_name=seg_dir_path)
df = pd.read_csv(all_bbox_path)

# Build the file name each annotation row is expected to map to.
segment_suffix = df["segm_idx"].astype(str)
expected_names = df["tablet_CDLI"].str.cat(segment_suffix, sep='_') + '.jpg'

df = df[expected_names.isin(seg_img_names)]
df.to_csv(
    all_bbox_path,
    index=False,  # do not write the frame's index as an extra column
)