### Data Collection

In [16]:
import tarfile
import os

def extract_files(filename, extract_path):
    # Check if extraction is necessary
    if os.path.exists(extract_path):
        print(f"Files already extracted in {extract_path}. Skipping extraction.")
        return

    # Check if the file is a tar archive
    if filename.endswith('.tar') or filename.endswith('.tgz'):
        with tarfile.open(filename, 'r:*') as tar:
            tar.extractall(extract_path)
            print(f"Extracted {filename} to {extract_path}")
    else:
        print("File is not a tar or tgz archive.")

if __name__ == '__main__':
    # Extract words.tgz
    extract_files('data/words.tgz', 'data/words')

    # Extract xml.tar
    extract_files('data/xml.tar', 'data/xml')

Files already extracted in data/words. Skipping extraction.
Files already extracted in data/xml. Skipping extraction.


In [17]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# Directory containing XML files
directory = 'data/xml'

# Initialize an empty list to hold the word data
data = []

# Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.xml'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        
        # Parse the XML
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        # Extract word data
        for word in root.findall('.//word'):
            word_id = word.get('id')
            text = word.get('text')
            tag = word.get('tag')
            components = [(int(cmp.get('x')), int(cmp.get('y')), int(cmp.get('width')), int(cmp.get('height'))) for cmp in word.findall('.//cmp')]
            
            # Append the data to our list
            data.append([word_id, text, tag, components])

# Create a DataFrame from the list
df = pd.DataFrame(data, columns=['WordID', 'Text', 'Tag', 'Components'])


In [19]:
df

Unnamed: 0,WordID,Text,Tag,Components
0,d06-050-00-00,Nor,CC,"[(354, 752, 43, 67), (399, 773, 50, 36)]"
1,d06-050-00-01,is,BEZ,"[(495, 780, 7, 35), (502, 766, 14, 51)]"
2,d06-050-00-02,she,PP3A,"[(553, 776, 33, 33), (588, 741, 54, 68)]"
3,d06-050-00-03,necessarily,RB,"[(697, 771, 50, 39), (759, 764, 82, 44), (849,..."
4,d06-050-00-04,being,BEG,"[(1027, 750, 64, 62), (1093, 781, 21, 26), (10..."
...,...,...,...,...
115315,g06-018m-07-06,of,INO,"[(1364, 1923, 72, 70)]"
115316,g06-018m-07-07,a,AT,"[(1451, 1956, 35, 29)]"
115317,g06-018m-07-08,fire,NN,"[(1541, 1916, 142, 88), (1623, 1924, 11, 7)]"
115318,g06-018m-07-09,...,...,"[(1712, 1978, 6, 8), (1734, 1978, 7, 10)]"


In [None]:
import cv2
import numpy as np

def load_and_preprocess_image(image_path, components, target_size=(64, 64)):
    # Load the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    
    # Initialize a list to hold the processed components
    processed_components = []
    
    for (x, y, width, height) in components:
        # Crop the image to the component's bounding box
        cropped = image[y:y+height, x:x+width]
        
        # Resize the image to the target size
        resized = cv2.resize(cropped, target_size)
        
        # Normalize pixel values to be between 0 and 1
        normalized = resized / 255.0
        
        # Append the processed image to the list
        processed_components.append(normalized)
    
    # Combine the processed components into one array (if necessary)
    # Depending on your model, you might concatenate, stack, or average them
    # For simplicity, we'll stack them here
    combined = np.stack(processed_components, axis=0)
    
    return combined

In [25]:
import pandas as pd

# Path to your text file
file_path = 'data/words.txt'

# Initialize an empty list to hold the parsed data
data = []

# Open and read the file line by line
with open(file_path, 'r') as file:
    for line in file:
        # Skip lines that don't contain the data (like comments)
        if line.startswith('#') or not line.strip():
            continue

        # Split the line into parts
        parts = line.split()

        # Extract the relevant information
        word_id = parts[0]
        segmentation_result = parts[1]
        graylevel = int(parts[2])
        #num_components = int(parts[3])
        bounding_box = tuple(map(int, parts[3:7]))
        grammatical_tag = parts[7]
        transcription = ' '.join(parts[8:])

        # Append the data to the list
        data.append([word_id, segmentation_result, graylevel, bounding_box, grammatical_tag, transcription])

# Create a DataFrame from the list
df = pd.DataFrame(data, columns=['WordID', 'SegmentationResult', 'GrayLevel', 'BoundingBox', 'GrammaticalTag', 'Transcription'])

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,WordID,SegmentationResult,GrayLevel,BoundingBox,GrammaticalTag,Transcription
0,a01-000u-00-00,ok,154,"(408, 768, 27, 51)",AT,A
1,a01-000u-00-01,ok,154,"(507, 766, 213, 48)",NN,MOVE
2,a01-000u-00-02,ok,154,"(796, 764, 70, 50)",TO,to
3,a01-000u-00-03,ok,154,"(919, 757, 166, 78)",VB,stop
4,a01-000u-00-04,ok,154,"(1185, 754, 126, 61)",NPT,Mr.
