# Image and Text Feature Extractor for RIMAS dataset

## Setup and library imports

In [5]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
# Mode 2: re-import edited modules automatically before each cell runs, so
# changes made under `src/` are picked up without restarting the kernel.
%autoreload 2

In [6]:
import os
import sys

import cv2
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from typing import List, Tuple

# Make the repository root importable so the `src.*` imports below resolve.
# The root is taken to be two levels above the current working directory,
# which assumes the kernel was started in this notebook's own folder
# (TODO confirm for other launch locations).
# Guarded so that re-running this cell does not pile up duplicate entries.
project_root = str(Path(os.getcwd()).resolve().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.core.config.config import Config
from src.ml.embeddings.feature_extractor import FeatureExtractor

# Shared project configuration object. The feature-extraction cell further
# down reads DATASET_PATH, TARGET_SIZE, IMAGE_EMBEDDINGS_PATH and ENCODER_TYPE
# from it.
config = Config()

## Research Workflow: Image & Text Feature Extraction

### Goal

Identify the most effective feature extraction method (for both images and text) by experimenting with multiple approaches and comparing classification performance.

---

## Image Feature Extraction

### Methods to Compare

* Flattening approach
* HOG (Histogram of Oriented Gradients)
* LBP (Local Binary Patterns)
* SIFT (Scale-Invariant Feature Transform)
* SURF (Speeded-Up Robust Features)
* (Optional: add more methods)

### Workflow

1. Extract features using each method.
2. Construct a consolidated `DataFrame` with all feature sets.
3. Train classification models on each feature representation.
4. Evaluate classifiers on the task: detecting the presence of specific letters in word images.
5. Compare metrics across models and methods.
6. Select the best-performing image feature extractor.

---

## Text Feature Extraction

### Starting Point

* Bag of letters representation

### Next Steps

1. Implement the bag-of-letters representation as a baseline.
2. Experiment with additional encoders (TF-IDF, n-grams, etc.).
3. Train classifiers on text-based features.
4. Evaluate and compare performance.

---

## Evaluation

* **Metrics:** Accuracy, Precision, Recall, F1-score (and others if needed).
* **Outcome:** Best image feature extractor + best text feature extractor → Final approaches for classification tasks.


### Flatten Image Embeddings

In [7]:
# Collect the constructor arguments from the shared config in one place.
# NOTE(review): load_flag=False presumably forces embeddings to be recomputed
# rather than loaded from disk (the recorded log shows them being saved to
# parquet) — confirm against FeatureExtractor.
extractor_kwargs = dict(
    dataset_path=config.DATASET_PATH,
    target_size=config.TARGET_SIZE,
    image_embeddings_path=config.IMAGE_EMBEDDINGS_PATH,
    encoder_type=config.ENCODER_TYPE,
    load_flag=False,
)
feature_extractor = FeatureExtractor(**extractor_kwargs)

# Sanity check: show only the first embedding record, not the whole list.
first_record = feature_extractor.image_embeddings_list[:1]
print(first_record)

INFO:core.loaders.data_loader:Loaded 28475 text-image pairs
100%|██████████| 100/100 [00:04<00:00, 24.79it/s]
INFO:utils.saving_utils.save_embeddings:Image embeddings saved to /home/nikolay/Deloitte/RIMAS/src/data/processed/words/weights/image_embeddings.parquet successfully.


[{'image_embedding': array([19.508982 , 15.610779 , 11.281437 , 14.832335 , 13.826347 ,
       13.017964 , 13.275449 , 19.233534 , 55.017963 , 29.550898 ,
       11.934132 ,  8.8503   , 14.42515  , 18.712574 , 23.838324 ,
       36.407185 , 35.269463 , 14.886228 ,  7.3293414, 18.233534 ,
       44.09581  , 34.976048 , 26.964071 , 29.365269 , 13.598803 ,
        8.095808 ,  8.7365265, 17.862276 , 37.497005 , 25.898203 ,
       20.592813 , 16.57485  , 34.45509  , 13.646707 , 13.8503   ,
       18.532934 , 18.479042 , 19.173653 , 14.48503  , 20.33533  ,
       95.628746 , 32.035927 , 19.886227 , 23.221558 , 23.886227 ,
       25.628742 , 27.706587 , 48.526947 , 60.79042  , 29.976048 ,
       17.54491  , 33.221558 , 67.17365  , 45.38922  , 30.622755 ,
       29.58084  , 16.443113 , 12.838324 , 15.311378 , 28.508982 ,
       55.353294 , 28.994013 , 18.323353 , 17.035929 , 34.54491  ,
       16.772455 , 14.652695 , 20.82036  , 18.467066 , 17.221558 ,
       14.251497 , 16.257484 , 95.21557  