In [None]:
from processing.functions import main as main_v1
from annotation import main as main_v2

In [57]:
from file_manager import MusicFileManager
from annotation import main, xls_to_df
import pandas as pd
import os
import csv

In [58]:
# --- Example Setup ---
data_dir = "data/"  # <--- IMPORTANT: SET THIS
output_annotations_csv = "pseudo_annotations2.csv"
# Shared accumulator: main() receives this list as `annotations_list` for every
# workbook, so all annotations end up in one place for the CSV export cell.
all_annotations_for_csv = []

# Make sure the input folder exists so the listing below cannot fail on a fresh checkout.
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Keep only Excel workbooks (extension check is case-insensitive).
excel_suffixes = ('.xls', '.xlsx')
files_to_process = []
for entry in os.listdir(data_dir):
    if entry.lower().endswith(excel_suffixes):
        files_to_process.append(entry)

if not files_to_process:
    print(f"No Excel files found in {data_dir}. Please add some files or check the path.")
    # exit() # Optional: exit if no files

for filename in files_to_process:
    print(f"\nProcessing file: {filename}")
    df_excel, sheet_name = xls_to_df(filename, base_dir=data_dir)

    # Guard clause: nothing to annotate when the workbook could not be read.
    if df_excel is None or sheet_name is None:
        print(f"Could not process DataFrame from {filename}")
        continue

    print(f"Successfully read sheet '{sheet_name}' from {filename}")
    # Optional keyword arguments to restrict the processed rows while debugging:
    #   start_row_index=155, end_row_index=200
    structured_data = main(
        df_excel,
        filename_for_ann=filename,
        sheetname_for_ann=sheet_name,
        annotations_list=all_annotations_for_csv,
        debug=True,  # Enable debug prints
        debug_location=True,
        debug_header=False,
    )
    print(f"\n--- Structured Data for {filename} ---")
    print(structured_data.head())
    print("...")


Processing file: RDO No. 1 - Laoag City, Ilocos Norte.xls
Successfully read sheet 'Sheet 9 (DO 047-2023)' from RDO No. 1 - Laoag City, Ilocos Norte.xls

Running find_location_components starting at df index 0

find_location_components: Processing df_row 0 (offset 0/2)

find_location_components: Processing df_row 1 (offset 1/2)

find_location_components: Processing df_row 2 (offset 2/2)

Running find_location_components starting at df index 1

find_location_components: Processing df_row 1 (offset 0/2)

find_location_components: Processing df_row 2 (offset 1/2)

find_location_components: Processing df_row 3 (offset 2/2)

Running find_location_components starting at df index 2

find_location_components: Processing df_row 2 (offset 0/2)

find_location_components: Processing df_row 3 (offset 1/2)

find_location_components: Processing df_row 4 (offset 2/2)

Running find_location_components starting at df index 3

find_location_components: Processing df_row 3 (offset 0/2)

find_location_comp

In [59]:
# Row-label vocabulary. These names are the keys of the priority table used by the
# annotation de-duplication cell, so they must match the label strings stored in the
# "label" field of each annotation.
# NOTE(review): meanings below are inferred from the names only — confirm against the
# `annotation` module.
LABEL_LOC_P = "LOC_P"  # presumably a province-level location row
LABEL_LOC_C = "LOC_C"  # presumably a city/municipality-level location row
LABEL_LOC_B = "LOC_B"  # presumably a barangay/zone-level location row
LABEL_HDR = "HDR"  # presumably a table header row
LABEL_DATA = "DATA"  # presumably a data row
LABEL_BLANK = "BLANK"  # presumably an empty row
LABEL_OTHER = "OTHER"  # generic fallback label (lowest priority in the de-duplication table)
LABEL_TITLE = "TITLE"  # presumably a title row
LABEL_NOTE = "NOTE"  # presumably a note/remark row

In [60]:
# --- Write all collected annotations to a single CSV file ---
if all_annotations_for_csv:
    # Higher value = more specific label. When the same (file, sheet, row) was annotated
    # more than once, the entry with the strictly highest priority is kept; on ties the
    # first one seen wins. Unknown labels rank below everything (-1).
    label_priority = {LABEL_LOC_P: 5, LABEL_LOC_C: 5, LABEL_LOC_B: 5, LABEL_HDR: 4, LABEL_DATA: 3,
                      LABEL_TITLE: 2, LABEL_NOTE: 2, LABEL_BLANK: 1, LABEL_OTHER: 0}

    # One dict lookup per annotation instead of re-scanning the result list for every
    # duplicate. Dicts preserve insertion order and overwriting a value keeps the key's
    # position, so the resulting order matches a "replace in place" list update.
    best_annotation_by_key = {}
    for ann in all_annotations_for_csv:
        # Globally unique row identity: (filename, sheetname, row_index)
        ann_key = (ann["filename"], ann["sheetname"], ann["row_index"])
        if ann_key not in best_annotation_by_key:
            best_annotation_by_key[ann_key] = ann
            continue
        existing_ann = best_annotation_by_key[ann_key]
        if label_priority.get(ann["label"], -1) > label_priority.get(existing_ann["label"], -1):
            best_annotation_by_key[ann_key] = ann  # Update with more specific label

    final_unique_annotations = list(best_annotation_by_key.values())

    # Sort by filename, sheetname, then row_index for consistent output
    final_unique_annotations.sort(key=lambda x: (x["filename"], x["sheetname"], x["row_index"]))

    print(f"\nWriting {len(final_unique_annotations)} pseudo-annotations to {output_annotations_csv}")
    annotation_df = pd.DataFrame(final_unique_annotations)
    annotation_df.to_csv(output_annotations_csv, index=False, quoting=csv.QUOTE_ALL)
    print("Annotation CSV created successfully.")
else:
    print("No annotations were generated.")


Writing 43099 pseudo-annotations to pseudo_annotations2.csv
Annotation CSV created successfully.


In [107]:
import pandas as pd
import json
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from typing import List, Dict, Tuple

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
# pip install pytorch-crf
from torchcrf import CRF

# --- Configuration & Constants ---
# Hyperparameters and shared constants for the feature extraction and model cells below.
# Prefixes checked (case-insensitively) against the first cell of a row; each one becomes
# its own `starts_with_keyword_*` feature.
KEYWORDS_FOR_STARTS_WITH = ["Province", "City", "Municipality", "Barangay", "Zone", "Street"]
# Case-insensitive pattern behind the `contains_keyword_zv` feature. It matches an ordinal
# followed by "REVISION"/"Rev", a "Z.V. ... SQ ... M" style phrase (a leading "2" is
# accepted in place of "Z"), or the word "FINAL".
ZV_PATTERN_REGEX = re.compile(
    r"\d+(?:ST|ND|RD|TH)\s+(?:REVISION|Rev)(?:.*Z\.?V\.?.*SQ.*M\.?)?|"
    r"(?:\d+(?:ST|ND|RD|TH)\s+REVISION|Rev\s+ZV\s+/?.*SQ\.?\s*M\.?)|"
    r"(?:Z|2)\.?V\.?.*SQ.*M\.?|FINAL",
    re.IGNORECASE
)
VOCAB_SIZE = 20000  # Upper bound on vocabulary size for row texts (includes the pad/unk entries)
EMBEDDING_DIM = 128  # Embedding dimension for row-text tokens
TEXT_SEQ_OUTPUT_LEN = 50  # Tokens kept per row's concatenated text (shorter rows are padded, longer ones truncated)
LSTM_HIDDEN_DIM = 128  # Hidden size per direction of the BiLSTM that runs over the rows of a sheet
BATCH_SIZE = 4  # Sheets per DataLoader batch (kept small for demonstration)
EPOCHS = 50  # Small number of epochs for demonstration
LEARNING_RATE = 1e-3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use the GPU when one is available
PAD_TOKEN = "<pad>"  # Vocabulary entry reserved for padding
UNK_TOKEN = "<unk>"  # Vocabulary entry reserved for out-of-vocabulary tokens


In [62]:

# --- Feature Engineering (Identical to your Keras version) ---
def is_numeric_str(s: str) -> bool:
    """Tell whether ``s`` can be parsed by ``float()``.

    Empty (falsy) input is never numeric. Anything ``float()`` accepts counts as
    numeric, including scientific notation and surrounding whitespace.
    """
    if s:
        try:
            float(s)
        except ValueError:
            return False
        return True
    return False


def extract_single_row_features(raw_cells_list: list[str]) -> dict:
    """Summarise one spreadsheet row as cleaned text plus a fixed-order numeric vector.

    Args:
        raw_cells_list: Cell values of a single row in column order. Entries may be
            ``None``; every other value is inspected through ``str()``.

    Returns:
        A dict with two entries:
        ``'concatenated_text_clean'`` - the non-blank cell texts joined by single spaces;
        ``'numeric_features'`` - a ``float32`` array holding nine layout/content
        statistics followed by one ``starts_with_keyword_*`` flag per entry of
        ``KEYWORDS_FOR_STARTS_WITH``.
    """

    def _is_filled(cell) -> bool:
        # A cell counts as filled when it is not None and not whitespace-only.
        return cell is not None and str(cell).strip() != ""

    filled_positions = [pos for pos, cell in enumerate(raw_cells_list) if _is_filled(cell)]
    filled_texts = [str(raw_cells_list[pos]) for pos in filled_positions]

    total_cells = float(len(raw_cells_list))
    filled_count = float(len(filled_texts))
    numeric_count = sum(1 for text in filled_texts if is_numeric_str(text))

    # Keyword flags only look at column 0, even when that cell is blank.
    leading_cell = ""
    if raw_cells_list and raw_cells_list[0] is not None:
        leading_cell = str(raw_cells_list[0]).lower()

    # The ZV pattern is searched over every non-None cell, blank ones included.
    zv_haystack = " ".join(str(cell) for cell in raw_cells_list if cell is not None)

    scalar_features = {
        'num_cells_in_row': total_cells,
        # -1 signals "no filled cell in this row"
        'first_non_null_column': float(filled_positions[0] if filled_positions else -1),
        'last_non_null_column': float(filled_positions[-1] if filled_positions else -1),
        'num_non_empty_cells': filled_count,
        'ratio_non_empty_cells': filled_count / total_cells if total_cells > 0 else 0.0,
        'num_numeric_cells': float(numeric_count),
        'ratio_numeric_cells': numeric_count / filled_count if filled_count > 0 else 0.0,
        'is_row_empty_or_whitespace_only': 1.0 if filled_count == 0 else 0.0,
        'contains_keyword_zv': 1.0 if ZV_PATTERN_REGEX.search(zv_haystack) else 0.0,
    }

    keyword_feature_names = []
    for keyword in KEYWORDS_FOR_STARTS_WITH:
        feature_name = f'starts_with_keyword_{keyword.lower().replace(" ", "_")}'
        scalar_features[feature_name] = 1.0 if leading_cell.startswith(keyword.lower()) else 0.0
        keyword_feature_names.append(feature_name)

    # The vector layout is fixed: base statistics first, keyword flags last.
    feature_order = [
        'num_cells_in_row', 'first_non_null_column', 'last_non_null_column',
        'num_non_empty_cells', 'ratio_non_empty_cells', 'num_numeric_cells',
        'ratio_numeric_cells', 'is_row_empty_or_whitespace_only', 'contains_keyword_zv',
    ] + keyword_feature_names
    numeric_vector = np.array(
        [scalar_features.get(name, 0.0) for name in feature_order], dtype=np.float32
    )

    return {
        'concatenated_text_clean': " ".join(filled_texts),
        'numeric_features': numeric_vector,
    }

In [63]:


# --- PyTorch Text Vectorization Helper ---
class TextVectorizer:
    """Whitespace tokenizer that maps row texts to fixed-length sequences of token ids.

    Id 0 is reserved for ``PAD_TOKEN`` and id 1 for ``UNK_TOKEN``; real tokens start at 2.
    Texts are lower-cased and split on whitespace.

    Args:
        max_tokens: Upper bound on the vocabulary size, including the two reserved ids.
        output_sequence_length: Length every produced sequence is padded/truncated to.
    """

    def __init__(self, max_tokens=VOCAB_SIZE, output_sequence_length=TEXT_SEQ_OUTPUT_LEN):
        self.max_tokens = max_tokens
        self.output_sequence_length = output_sequence_length
        self.vocab = {}
        self.pad_token_id = 0
        self.unk_token_id = 1
        # Start with the reserved tokens so `vocab_size` and the lookup tables are
        # usable (everything maps to <unk>) even before `fit_on_texts` is called.
        self.token_to_idx = {PAD_TOKEN: self.pad_token_id, UNK_TOKEN: self.unk_token_id}
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}
        self.vocab_size = len(self.token_to_idx)

    def fit_on_texts(self, texts: List[str]):
        """Build the vocabulary from ``texts``, keeping the most frequent tokens."""
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.lower().split())

        # A literal "<pad>"/"<unk>" in the corpus must not steal the reserved ids.
        for special_token in (PAD_TOKEN, UNK_TOKEN):
            word_counts.pop(special_token, None)

        # Keep most common words, reserve two slots for pad and unk
        n_regular_tokens = max(self.max_tokens - 2, 0)
        common_words = [word for word, _count in word_counts.most_common(n_regular_tokens)]

        self.token_to_idx = {PAD_TOKEN: self.pad_token_id, UNK_TOKEN: self.unk_token_id}
        for offset, token in enumerate(common_words):
            self.token_to_idx[token] = offset + 2  # Start after pad and unk
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}
        self.vocab_size = len(self.token_to_idx)
        print(f"Vocabulary size: {self.vocab_size}")

    def texts_to_sequences(self, texts: List[str]) -> List[List[int]]:
        """Convert each text to exactly ``output_sequence_length`` token ids.

        Unknown tokens map to the unk id; short texts are right-padded with the pad id
        and long texts are truncated.
        """
        sequences = []
        for text in texts:
            tokens = text.lower().split()
            seq = [self.token_to_idx.get(token, self.unk_token_id) for token in tokens]
            # Pad or truncate individual row text sequence
            if len(seq) < self.output_sequence_length:
                seq.extend([self.pad_token_id] * (self.output_sequence_length - len(seq)))
            else:
                seq = seq[:self.output_sequence_length]
            sequences.append(seq)
        return sequences


In [76]:

# --- Data Loading and Preprocessing ---
# Same file the annotation-export cell writes.
ANNOTATIONS_CSV_PATH = "pseudo_annotations2.csv"

print("Loading annotations...")
try:
    df_annotations = pd.read_csv(ANNOTATIONS_CSV_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): in a notebook exit() tears down the kernel,
    # while an exception stops this cell and shows the real (correct) file name.
    raise FileNotFoundError(
        f"Error: {ANNOTATIONS_CSV_PATH} not found. Run the annotation export cell first."
    ) from err

# `raw_text` holds a JSON-encoded list of cell values; missing values become an empty row.
df_annotations['raw_cells_list'] = df_annotations['raw_text'].apply(
    lambda x: json.loads(x) if pd.notna(x) else [])

print("Extracting features for each row...")
all_row_data = []
# Iterate over the needed columns directly; this avoids building a Series per row.
for raw_cells, label, filename, sheetname in zip(
        df_annotations['raw_cells_list'],
        df_annotations['label'],
        df_annotations['filename'],
        df_annotations['sheetname']):
    features = extract_single_row_features(raw_cells)
    all_row_data.append({
        'text': features['concatenated_text_clean'],
        'numerics': features['numeric_features'],
        'label': label,
        'filename': filename,
        'sheetname': sheetname
    })
df_processed_rows = pd.DataFrame(all_row_data)

Loading annotations...
Extracting features for each row...


In [78]:
# Spot-check four randomly chosen processed rows (no seed is set, so the rows shown differ per run).
df_processed_rows.sample(4)

Unnamed: 0,text,numerics,label,filename,sheetname
23213,"Revenue District Office No. 1 - LAOAG CITY, ILOCOS NORTE","[4.0, 0.0, 0.0, 1.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",OTHER,"RDO No. 1 - Laoag City, Ilocos Norte.xls",Sheet 9 (DO 047-2023)
30176,SAN JOSE J WRIGHT-SAN MIGUEL RR 44000,"[6.0, 0.0, 5.0, 4.0, 0.6666667, 1.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",DATA,RDO No. 42 - San Juan City.xls,Sheet 8 (DO 022-2023)
8308,A12 233,"[4.0, 2.0, 3.0, 2.0, 0.5, 1.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",DATA,"RDO No. 1 - Laoag City, Ilocos Norte.xls",Sheet 9 (DO 047-2023)
28792,ZONE/BARANGAY HAGDANG BATO LIBIS Effectivity Date 2022-09-22 00:00:00,"[4.0, 0.0, 3.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",LOC_B,RDO No. 41 - Mandaluyong City.xls,Sheet 9 (DO 059-2022


In [79]:

# --- Group rows into sequences (sheets) ---
print("Grouping rows into sequences (sheets)...")
# Three parallel per-sheet containers, index-aligned with each other:
#   sheet_sequences_text_raw   -> list of lists of strings (row texts within a sheet)
#   sheet_sequences_numerics   -> list of np.arrays (numeric features for rows within a sheet)
#   sheet_sequences_labels_str -> list of lists of strings (labels for rows within a sheet)
sheet_sequences_text_raw, sheet_sequences_numerics, sheet_sequences_labels_str = [], [], []

Grouping rows into sequences (sheets)...


In [80]:

# Start from empty containers so that re-running this cell rebuilds the sequences
# instead of appending every sheet a second time.
sheet_sequences_text_raw = []
sheet_sequences_numerics = []
sheet_sequences_labels_str = []

# One sequence per (filename, sheetname) pair; rows keep their order within the group.
for _sheet_key, group in df_processed_rows.groupby(['filename', 'sheetname']):
    sheet_sequences_text_raw.append(group['text'].tolist())
    sheet_sequences_numerics.append(np.stack(group['numerics'].values))
    sheet_sequences_labels_str.append(group['label'].tolist())

if not sheet_sequences_text_raw:
    # Raise rather than exit(): exit() would shut down the notebook kernel.
    raise RuntimeError("No sequences found after grouping. Exiting.")

In [81]:

# --- Label Encoding ---
print("Encoding labels...")
# Distinct label strings across all sheets, in sorted order.
all_unique_labels = sorted({
    label
    for sheet_labels in sheet_sequences_labels_str
    for label in sheet_labels
})
label_encoder = LabelEncoder().fit(all_unique_labels)
n_classes = len(label_encoder.classes_)

# Label sequences are padded to a common length for batching. The padding value is
# chosen outside the valid class range 0..n_classes-1 so it cannot be mistaken for a
# real class; padded positions are meant to be excluded through the CRF mask.
# NOTE(review): pytorch-crf appears to index its transition matrix with the tag values
# at masked positions as well, so an out-of-range pad id may raise an index error unless
# the model remaps padded labels before calling the CRF — confirm in the training code.
LABEL_PAD_ID = n_classes  # or -1, but ensure consistency
print(f"Found {n_classes} unique classes: {label_encoder.classes_}. Using {LABEL_PAD_ID} as label padding ID.")

Encoding labels...
Found 7 unique classes: ['BLANK' 'DATA' 'HDR' 'LOC_B' 'LOC_C' 'LOC_P' 'OTHER']. Using 7 as label padding ID.


In [82]:

# Integer-encode the label strings sheet by sheet (one array of class ids per sheet).
sheet_sequences_labels_encoded = []
for sheet_labels in sheet_sequences_labels_str:
    sheet_sequences_labels_encoded.append(label_encoder.transform(sheet_labels))

In [83]:

# --- Text Vectorization ---
print("Vectorizing text data...")
# The vocabulary is fitted on the row texts of every sheet (flattened into one list).
all_texts_for_vocab = []
for sheet_texts in sheet_sequences_text_raw:
    all_texts_for_vocab.extend(sheet_texts)

text_vectorizer = TextVectorizer(max_tokens=VOCAB_SIZE, output_sequence_length=TEXT_SEQ_OUTPUT_LEN)
text_vectorizer.fit_on_texts(all_texts_for_vocab)

# Convert sheet texts to sequences of token IDs: one int64 matrix per sheet,
# with one fixed-length row of token ids per spreadsheet row.
sheet_sequences_text_ids = [
    np.array(text_vectorizer.texts_to_sequences(sheet_texts), dtype=np.int64)
    for sheet_texts in sheet_sequences_text_raw
]

Vectorizing text data...
Vocabulary size: 10202


In [84]:

# --- Determine n_numeric_features ---
# Mirror of the feature order used inside extract_single_row_features.
_feature_order_for_len_calc = [
                                  'num_cells_in_row', 'first_non_null_column', 'last_non_null_column',
                                  'num_non_empty_cells', 'ratio_non_empty_cells', 'num_numeric_cells',
                                  'ratio_numeric_cells', 'is_row_empty_or_whitespace_only', 'contains_keyword_zv'
                              ] + [f'starts_with_keyword_{kw.lower().replace(" ", "_")}' for kw in
                                   KEYWORDS_FOR_STARTS_WITH]
n_numeric_features = len(_feature_order_for_len_calc)

# This list is a hand-maintained copy, so verify it against the arrays that were
# actually produced; a silent mismatch would only surface later as a shape error
# inside the model.
_observed_feature_widths = {sheet_numerics.shape[1] for sheet_numerics in sheet_sequences_numerics}
if _observed_feature_widths != {n_numeric_features}:
    raise ValueError(
        f"Numeric feature width mismatch: expected {n_numeric_features}, "
        f"found {sorted(_observed_feature_widths)} in the extracted features."
    )
print(f"Number of scalar numeric features per row: {n_numeric_features}")

Number of scalar numeric features per row: 15


In [85]:

# --- Data Splitting (Sheet-level) ---
print("Splitting data into train, validation, test sets...")
# Whole sheets are assigned to a split, so the indices below address sheets, not rows.
indices = np.arange(len(sheet_sequences_text_ids))

# Step 1: hold out 20% of the sheets for testing.
train_indices, test_indices = train_test_split(
    indices,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
# Step 2: carve 15% of the remaining sheets out for validation.
train_indices, val_indices = train_test_split(
    train_indices,
    test_size=0.15,
    shuffle=True,
    random_state=42,
)

Splitting data into train, validation, test sets...


In [86]:


def get_split_data_pytorch(selected_indices):
    """Collect the per-sheet data of one split.

    Args:
        selected_indices: Sheet indices into the notebook-level ``sheet_sequences_*`` lists.

    Returns:
        Tuple ``(texts, numerics, labels, lengths)`` of equally long lists: token-id
        matrices (left as stored), float32 feature tensors, long label tensors and the
        number of rows of each sheet (needed later to build CRF masks).
    """
    split_texts, split_numerics, split_labels, split_lengths = [], [], [], []
    for sheet_idx in selected_indices:
        split_texts.append(sheet_sequences_text_ids[sheet_idx])
        split_numerics.append(
            torch.tensor(sheet_sequences_numerics[sheet_idx], dtype=torch.float32)
        )
        label_tensor = torch.tensor(sheet_sequences_labels_encoded[sheet_idx], dtype=torch.long)
        split_labels.append(label_tensor)
        # Original number of rows in the sheet
        split_lengths.append(len(label_tensor))
    return split_texts, split_numerics, split_labels, split_lengths


In [87]:

# Materialise the three splits in one pass: train, validation, test (in that order).
(
    (train_texts_ids, train_numerics, train_labels, train_lengths),
    (val_texts_ids, val_numerics, val_labels, val_lengths),
    (test_texts_ids, test_numerics, test_labels, test_lengths),
) = (
    get_split_data_pytorch(split_indices)
    for split_indices in (train_indices, val_indices, test_indices)
)

# Each "sequence" here is one whole sheet.
print(
    f"Train sequences: {len(train_texts_ids)}, Val sequences: {len(val_texts_ids)}, Test sequences: {len(test_texts_ids)}")

Train sequences: 9, Val sequences: 2, Test sequences: 3


# Split by row

In [15]:
# Inspect the token-id matrix of the first sheet (one padded row of token ids per spreadsheet row).
sheet_sequences_text_ids[0]

array([[1573,    5,   82, ...,    0,    0,    0],
       [1277,    5, 1575, ...,    0,    0,    0],
       [1278,    0,    0, ...,    0,    0,    0],
       ...,
       [4086, 4087, 2191, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [4089, 4090, 4091, ...,    0,    0,    0]], dtype=int64)

In [88]:
# # Assuming sheet_sequences_text_ids[0], sheet_sequences_numerics[0], etc. are your data
# all_text_ids = sheet_sequences_text_ids[0]
# all_numerics = sheet_sequences_numerics[0]
# all_labels = sheet_sequences_labels_encoded[0]
# 
# # Create row indices
# row_indices = np.arange(len(all_labels))
# 
# # Split rows into train/test/val
# train_row_indices, test_row_indices = train_test_split(row_indices, test_size=0.2, random_state=42)
# train_row_indices, val_row_indices = train_test_split(train_row_indices, test_size=0.15, random_state=42)
# 
# # Extract data by row indices for train set
# train_texts_ids = [all_text_ids[i] for i in train_row_indices]
# train_numerics = torch.tensor([all_numerics[i] for i in train_row_indices], dtype=torch.float32)
# train_labels = torch.tensor([all_labels[i] for i in train_row_indices], dtype=torch.long)
# 
# # Since we're now working with individual rows, each "sample" is a single row
# # If your model expects a length for each row (e.g., the number of tokens in each row's text),
# # you should extract that length from your data
# train_lengths = [len(all_text_ids[i]) for i in train_row_indices]  # Length of text for each row
# 
# # Extract data by row indices for validation set
# val_texts_ids = [all_text_ids[i] for i in val_row_indices]
# val_numerics = torch.tensor([all_numerics[i] for i in val_row_indices], dtype=torch.float32)
# val_labels = torch.tensor([all_labels[i] for i in val_row_indices], dtype=torch.long)
# val_lengths = [len(all_text_ids[i]) for i in val_row_indices]
# 
# # Extract data by row indices for test set
# test_texts_ids = [all_text_ids[i] for i in test_row_indices]
# test_numerics = torch.tensor([all_numerics[i] for i in test_row_indices], dtype=torch.float32)
# test_labels = torch.tensor([all_labels[i] for i in test_row_indices], dtype=torch.long)
# test_lengths = [len(all_text_ids[i]) for i in test_row_indices]
# 
# # Print split sizes
# print(f"Train rows: {len(train_lengths)}, Val rows: {len(val_lengths)}, Test rows: {len(test_lengths)}")

In [89]:


# --- PyTorch Dataset and DataLoader ---
class SheetDataset(Dataset):
    """Dataset in which every item is one complete sheet (a sequence of rows).

    The four constructor arguments are parallel lists indexed by sheet: token-id
    matrices, numeric feature tensors, label tensors and original row counts.
    """

    def __init__(self, texts_ids_list, numerics_list, labels_list, lengths_list):
        self.texts_ids_list = texts_ids_list
        self.numerics_list = numerics_list
        self.labels_list = labels_list
        # Original (unpadded) number of rows per sheet
        self.lengths_list = lengths_list

    def __len__(self):
        # Number of sheets
        return len(self.texts_ids_list)

    def __getitem__(self, idx):
        # Token ids are converted to a long tensor on access; the other fields are
        # handed back exactly as stored.
        sheet_item = {}
        sheet_item["texts"] = torch.tensor(self.texts_ids_list[idx], dtype=torch.long)  # (seq_len_sheet, text_seq_output_len)
        sheet_item["numerics"] = self.numerics_list[idx]  # (seq_len_sheet, n_numeric_features)
        sheet_item["labels"] = self.labels_list[idx]  # (seq_len_sheet,)
        sheet_item["length"] = self.lengths_list[idx]  # scalar row count of the sheet
        return sheet_item


In [90]:

def collate_fn(batch):
    """Merge a list of sheet items into one padded batch.

    Every field is right-padded along the row axis up to the longest sheet in the
    batch. Texts are padded with the vectorizer's pad token id, numerics with 0.0
    and labels with ``LABEL_PAD_ID``.

    Returns:
        dict with ``"texts"``, ``"numerics"``, ``"labels"`` (padded, batch-first) and
        ``"lengths"`` (long tensor with the original row count of each sheet).
    """
    # Original row counts, kept so that downstream code can tell real rows from padding.
    lengths_batch = torch.tensor([sample['length'] for sample in batch], dtype=torch.long)

    # texts: (batch_size, max_sheet_len_in_batch, text_seq_output_len)
    padded_texts = pad_sequence(
        [sample['texts'] for sample in batch],
        batch_first=True,
        padding_value=text_vectorizer.pad_token_id,
    )
    # numerics: (batch_size, max_sheet_len_in_batch, n_numeric_features)
    padded_numerics = pad_sequence(
        [sample['numerics'] for sample in batch],
        batch_first=True,
        padding_value=0.0,
    )
    # labels: (batch_size, max_sheet_len_in_batch)
    padded_labels = pad_sequence(
        [sample['labels'] for sample in batch],
        batch_first=True,
        padding_value=LABEL_PAD_ID,
    )

    return dict(
        texts=padded_texts,
        numerics=padded_numerics,
        labels=padded_labels,
        lengths=lengths_batch,
    )

In [91]:
# Inspect the encoded label tensors of the training sheets (one tensor per sheet).
train_labels

[tensor([6, 6, 6,  ..., 4, 6, 6]),
 tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 3, 6, 6, 6, 6, 4,
         6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
         6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
         6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6,
         6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 5, 6, 4, 6, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 0, 6, 0, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
         0, 6, 0, 6, 5, 6, 4, 6, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [92]:


# Wrap each split in a SheetDataset (one item = one sheet).
train_dataset, val_dataset, test_dataset = (
    SheetDataset(texts, numerics, labels, lengths)
    for texts, numerics, labels, lengths in (
        (train_texts_ids, train_numerics, train_labels, train_lengths),
        (val_texts_ids, val_numerics, val_labels, val_lengths),
        (test_texts_ids, test_numerics, test_labels, test_lengths),
    )
)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    for eval_dataset in (val_dataset, test_dataset)
)


In [93]:

# --- PyTorch Model Definition ---
class TextFeatureExtractor(nn.Module):
    """Embed the tokens of each row and average them, ignoring padding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (and of the returned row vector).
        text_seq_output_len: Accepted for interface compatibility; not used here.
        padding_idx: Token id that marks padding. When omitted, the pad id of the
            notebook-level ``text_vectorizer`` is used (the previous behaviour).
            Passing it explicitly makes the module independent of notebook globals.
    """

    def __init__(self, vocab_size, embedding_dim, text_seq_output_len, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = text_vectorizer.pad_token_id
        # Stored on the module so forward() does not have to reach for a global.
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

    def forward(self, text_input_per_row):
        """Return one mean-pooled embedding per row.

        Args:
            text_input_per_row: Long tensor of token ids, shape (N, text_seq_output_len).

        Returns:
            Tensor of shape (N, embedding_dim); rows consisting only of padding yield zeros.
        """
        embedded_text = self.embedding(text_input_per_row)  # (N, text_seq_output_len, embedding_dim)
        # 1 for real tokens, 0 for padding
        mask = (text_input_per_row != self.padding_idx).unsqueeze(-1).to(embedded_text.dtype)
        summed_embeddings = (embedded_text * mask).sum(dim=1)  # (N, embedding_dim)
        # Token counts are whole numbers, so clamping at 1 only affects all-padding rows,
        # whose sum is already zero; it just avoids dividing by zero.
        non_padding_counts = mask.sum(dim=1).clamp(min=1.0)  # (N, 1)
        return summed_embeddings / non_padding_counts

In [94]:


class NumericFeatureExtractor(nn.Module):
    """Normalise the scalar features of each row independently with LayerNorm."""

    def __init__(self, n_numeric_features):
        super().__init__()
        # LayerNorm normalises every row on its own, so no statistics have to be
        # collected from the training set (as BatchNorm1d would require).
        self.norm = nn.LayerNorm(n_numeric_features)

    def forward(self, numeric_input_per_row):
        """Map (N, n_numeric_features) inputs to a normalised tensor of the same shape."""
        normalised = self.norm(numeric_input_per_row)
        return normalised


In [101]:

class RowClassifierSequenceModel(nn.Module):
    """Tag every row of a sheet: row features -> BiLSTM over rows -> CRF.

    Each row is represented by the output of ``text_branch`` concatenated with the
    output of ``numeric_branch``. A bidirectional LSTM runs over the rows of each
    sheet, a linear layer turns its outputs into per-class emission scores, and a
    CRF layer either scores the given targets or decodes a tag sequence.

    NOTE(review): ``CRF`` and ``LABEL_PAD_ID`` are not defined in this cell; they
    must already exist in the notebook namespace.
    """

    def __init__(self, vocab_size, embedding_dim, text_seq_output_len,
                 n_numeric_features, lstm_hidden_dim, n_classes):
        super().__init__()
        # text_seq_output_len is only passed through to the text branch.
        self.text_branch = TextFeatureExtractor(vocab_size, embedding_dim, text_seq_output_len)
        self.numeric_branch = NumericFeatureExtractor(n_numeric_features)

        # Per-row feature size after concatenating both branches.
        self.combined_feature_dim = embedding_dim + n_numeric_features
        self.bilstm = nn.LSTM(
            input_size=self.combined_feature_dim,
            hidden_size=lstm_hidden_dim,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward LSTM states are concatenated, hence twice the hidden size.
        self.fc_to_crf = nn.Linear(2 * lstm_hidden_dim, n_classes)
        self.crf = CRF(n_classes, batch_first=True)

    def forward(self, sheet_texts_ids, sheet_numerics, sheet_lengths, targets=None):
        """Compute the training loss, or decode tags when no targets are given.

        Args:
            sheet_texts_ids: (batch_size, max_sheet_len, text_seq_len) token ids.
            sheet_numerics: (batch_size, max_sheet_len, n_numeric_features).
            sheet_lengths: (batch_size,) number of real (non-padded) rows per sheet.
            targets: Optional label ids. Positions equal to ``LABEL_PAD_ID`` are
                replaced by 0 before being handed to the CRF; those positions are
                also excluded by the mask built from ``sheet_lengths``.

        Returns:
            With ``targets``: the negated CRF output computed with
            ``reduction='mean'`` (presumably a log-likelihood; confirm against the
            CRF implementation in use). Without: the result of ``crf.decode``.
        """
        batch_size, max_sheet_len, current_text_seq_len = sheet_texts_ids.shape
        numeric_width = sheet_numerics.shape[2]

        # Rows are encoded independently, so fold the sheet axis into the batch axis.
        text_per_row = self.text_branch(sheet_texts_ids.view(-1, current_text_seq_len))
        numeric_per_row = self.numeric_branch(sheet_numerics.view(-1, numeric_width))

        # Unfold to (batch_size, max_sheet_len, features) and join the two branches.
        merged_row_features = torch.cat(
            [
                text_per_row.view(batch_size, max_sheet_len, -1),
                numeric_per_row.view(batch_size, max_sheet_len, -1),
            ],
            dim=2,
        )

        # Lengths are moved to CPU for pack_padded_sequence (kept for wider
        # PyTorch-version compatibility).
        packed_rows = pack_padded_sequence(
            merged_row_features,
            sheet_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        packed_hidden, _ = self.bilstm(packed_rows)
        # total_length keeps the padded output aligned with max_sheet_len.
        bilstm_output, _ = pad_packed_sequence(
            packed_hidden, batch_first=True, total_length=max_sheet_len
        )

        emissions = self.fc_to_crf(bilstm_output)  # (batch_size, max_sheet_len, n_classes)
        device = emissions.device

        # True where a row index lies inside its sheet's real length.
        row_positions = torch.arange(max_sheet_len, device=device).expand(
            len(sheet_lengths), max_sheet_len
        )
        mask = row_positions < sheet_lengths.unsqueeze(1).to(device)

        if targets is None:
            # Inference: decoded tag indices.
            return self.crf.decode(emissions, mask=mask)

        current_targets = targets.to(device)
        # Work on a copy so the caller's tensor is untouched; padding labels become
        # class 0 (assumed to be a valid class index).
        safe_targets = current_targets.clone()
        safe_targets[current_targets == LABEL_PAD_ID] = 0
        return -self.crf(emissions, safe_targets, mask=mask, reduction='mean')

In [102]:

# --- Model Initialization ---
print("Defining PyTorch model...")

# Constructor arguments gathered in one place; the same mapping can be reused to
# re-instantiate the architecture before loading a saved state_dict.
model_hparams = dict(
    vocab_size=text_vectorizer.vocab_size,
    embedding_dim=EMBEDDING_DIM,
    text_seq_output_len=TEXT_SEQ_OUTPUT_LEN,
    n_numeric_features=n_numeric_features,
    lstm_hidden_dim=LSTM_HIDDEN_DIM,
    n_classes=n_classes,
)
model = RowClassifierSequenceModel(**model_hparams).to(DEVICE)

# The optimizer is created after the model has been moved to DEVICE.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

Defining PyTorch model...


In [108]:

# --- Training Loop ---
# Every epoch runs one optimisation pass over train_loader followed by a
# gradient-free pass over val_loader. Calling the model with `targets` returns the loss.


def _batch_to_device(batch, device):
    """Move the tensors of one collated batch to `device`.

    Returns:
        (texts, numerics, labels, lengths), read from the batch keys
        'texts', 'numerics', 'labels' and 'lengths'.
    """
    return (
        batch['texts'].to(device),
        batch['numerics'].to(device),
        batch['labels'].to(device),
        batch['lengths'].to(device),
    )


print("Training model...")
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for i, batch in enumerate(train_loader):
        texts, numerics, labels, lengths = _batch_to_device(batch, DEVICE)

        optimizer.zero_grad()
        # With targets the model always returns a loss tensor, so no None check is needed.
        loss = model(texts, numerics, lengths, targets=labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        if (i + 1) % 10 == 0:  # Print every 10 batches
            print(f"Epoch [{epoch + 1}/{EPOCHS}], Batch [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    # An empty loader would otherwise raise ZeroDivisionError; report NaN instead.
    n_train_batches = len(train_loader)
    avg_train_loss = total_train_loss / n_train_batches if n_train_batches else float("nan")
    print(f"Epoch [{epoch + 1}/{EPOCHS}] - Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            texts, numerics, labels, lengths = _batch_to_device(batch, DEVICE)
            loss = model(texts, numerics, lengths, targets=labels)
            total_val_loss += loss.item()

    # Same guard for an empty validation split.
    n_val_batches = len(val_loader)
    avg_val_loss = total_val_loss / n_val_batches if n_val_batches else float("nan")
    print(f"Epoch [{epoch + 1}/{EPOCHS}] - Validation Loss: {avg_val_loss:.4f}")

print("Training complete.")

Training model...
Epoch [1/50] - Training Loss: 1212.3270
Epoch [1/50] - Validation Loss: 886.5743
Epoch [2/50] - Training Loss: 1344.8021
Epoch [2/50] - Validation Loss: 778.4840
Epoch [3/50] - Training Loss: 1329.7575
Epoch [3/50] - Validation Loss: 730.6879
Epoch [4/50] - Training Loss: 1054.1796
Epoch [4/50] - Validation Loss: 685.8781
Epoch [5/50] - Training Loss: 1150.1810
Epoch [5/50] - Validation Loss: 627.2464
Epoch [6/50] - Training Loss: 766.5842
Epoch [6/50] - Validation Loss: 587.9197
Epoch [7/50] - Training Loss: 1033.8388
Epoch [7/50] - Validation Loss: 555.3022
Epoch [8/50] - Training Loss: 965.2339
Epoch [8/50] - Validation Loss: 528.2240
Epoch [9/50] - Training Loss: 664.3570
Epoch [9/50] - Validation Loss: 501.0763
Epoch [10/50] - Training Loss: 739.6747
Epoch [10/50] - Validation Loss: 479.6715
Epoch [11/50] - Training Loss: 828.3966
Epoch [11/50] - Validation Loss: 462.4685
Epoch [12/50] - Training Loss: 680.5805
Epoch [12/50] - Validation Loss: 446.9899
Epoch [13/

In [109]:

# --- Inference/Prediction (Example on one batch from test set) ---
print("\nMaking predictions on a sample from test set...")
model.eval()
if len(test_dataset) == 0:
    print("Test set is empty, cannot make predictions.")
else:
    # Take the first batch the test loader yields.
    sample_batch = next(iter(test_loader))
    texts_sample = sample_batch['texts'].to(DEVICE)
    numerics_sample = sample_batch['numerics'].to(DEVICE)
    lengths_sample = sample_batch['lengths'].to(DEVICE)
    labels_sample_true = sample_batch['labels']  # Left on CPU; only used for comparison

    # Without targets the model returns decoded tag indices, one list per sheet.
    with torch.no_grad():
        predicted_sequences_encoded = model(texts_sample, numerics_sample, lengths_sample)

    # Show predictions for a single sheet of the batch.
    idx_to_show = 0
    if idx_to_show >= len(predicted_sequences_encoded):
        print("Sample index out of bounds for the batch.")
    else:
        sheet_length = lengths_sample[idx_to_show]
        pred_tags_for_sheet = predicted_sequences_encoded[idx_to_show]
        # Cut the padded label row back to the sheet's original length.
        true_tags_for_sheet = labels_sample_true[idx_to_show][:sheet_length].tolist()

        # Only tags are shown; the row text is not looked up here.
        # NOTE(review): test_loader is constructed without shuffle=True, so the first
        # batch should correspond to the first items of test_dataset; confirm before
        # mapping rows back to their original text.
        print(f"\nSample Prediction for sheet {idx_to_show} (length {sheet_length.item()}):")
        predicted_labels_str = label_encoder.inverse_transform(pred_tags_for_sheet)
        true_labels_str = label_encoder.inverse_transform(true_tags_for_sheet)

        for i in range(sheet_length.item()):
            print(f"Row {i}: True='{true_labels_str[i]}', Predicted='{predicted_labels_str[i]}'")

# --- To save the model ---
# torch.save(model.state_dict(), "row_classifier_sequence_model.pth")
# print("PyTorch Model state_dict saved.")
# To save text vectorizer (vocab) and label encoder:
# import pickle
# with open('text_vectorizer.pkl', 'wb') as f:
#     pickle.dump(text_vectorizer, f)
# with open('label_encoder.pkl', 'wb') as f:
#     pickle.dump(label_encoder, f)
# print("Text vectorizer and label encoder saved.")

# To load:
# model_loaded = RowClassifierSequenceModel(...) # Instantiate model first
# model_loaded.load_state_dict(torch.load("row_classifier_sequence_model.pth"))
# model_loaded.to(DEVICE)
# model_loaded.eval()
# with open('text_vectorizer.pkl', 'rb') as f:
#     loaded_vectorizer = pickle.load(f)
# with open('label_encoder.pkl', 'rb') as f:
#     loaded_label_encoder = pickle.load(f)



Making predictions on a sample from test set...

Sample Prediction for sheet 0 (length 975):
Row 0: True='OTHER', Predicted='OTHER'
Row 1: True='OTHER', Predicted='OTHER'
Row 2: True='OTHER', Predicted='OTHER'
Row 3: True='OTHER', Predicted='OTHER'
Row 4: True='OTHER', Predicted='OTHER'
Row 5: True='OTHER', Predicted='OTHER'
Row 6: True='OTHER', Predicted='OTHER'
Row 7: True='OTHER', Predicted='OTHER'
Row 8: True='LOC_C', Predicted='OTHER'
Row 9: True='OTHER', Predicted='OTHER'
Row 10: True='OTHER', Predicted='OTHER'
Row 11: True='OTHER', Predicted='OTHER'
Row 12: True='OTHER', Predicted='OTHER'
Row 13: True='OTHER', Predicted='OTHER'
Row 14: True='OTHER', Predicted='OTHER'
Row 15: True='OTHER', Predicted='OTHER'
Row 16: True='LOC_B', Predicted='LOC_B'
Row 17: True='OTHER', Predicted='OTHER'
Row 18: True='OTHER', Predicted='OTHER'
Row 19: True='OTHER', Predicted='OTHER'
Row 20: True='LOC_C', Predicted='OTHER'
Row 21: True='OTHER', Predicted='OTHER'
Row 22: True='OTHER', Predicted='OTH

In [110]:
# Display the per-sheet lengths of the sampled test batch (taken from sample_batch['lengths']).
lengths_sample

tensor([  975,  1892, 25346])

In [106]:
def _flatten_label_ids(label_sequences):
    """Collapse per-sheet label sequences into one flat list of Python scalars.

    Assumes every element supports ``.item()`` (e.g. a list of tensors).
    """
    flat_ids = []
    for sequence in label_sequences:
        flat_ids.extend(label.item() for label in sequence)
    return flat_ids


# Relative frequency of each decoded label, first for the training split ...
print("Label distribution in training set:")
all_train_labels_flat = _flatten_label_ids(train_labels)
train_label_names = pd.Series(label_encoder.inverse_transform(all_train_labels_flat))
print(train_label_names.value_counts(normalize=True))

# ... then for the validation split.
print("\nLabel distribution in validation set:")
all_val_labels_flat = _flatten_label_ids(val_labels)
val_label_names = pd.Series(label_encoder.inverse_transform(all_val_labels_flat))
print(val_label_names.value_counts(normalize=True))

Label distribution in training set:
DATA     0.616791
OTHER    0.234108
BLANK    0.101482
HDR      0.017152
LOC_B    0.017077
LOC_C    0.009027
LOC_P    0.004363
Name: proportion, dtype: float64

Label distribution in validation set:
DATA     0.537351
OTHER    0.283741
BLANK    0.102323
LOC_B    0.035782
HDR      0.023227
LOC_C    0.011299
LOC_P    0.006277
Name: proportion, dtype: float64
