# <b>Cross-lingual information retrieval</b>: <b>Pipeline workflows</b>

****

## <b>Project preparation steps</b>

### Import required libraries.

In [1]:
# Data processing libraries.
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import emoji
import math

# Nltk libraries for text cleaning and processing.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

import stopwordsiso as stopwords
from unidecode import unidecode

# Import libraries for text cleaning.
from bs4 import BeautifulSoup

# Thread pooling.
from multiprocessing.dummy import Pool

# Import system specific libraries.
import os
import glob
import yaml
from tqdm import tqdm

# Import fast text library for language detection.
import fasttext

# Import libraries for performance evaluation and measurements.
import time
import torch

# Import FAISS library for indexing embedded vectors.
import faiss

# Sentence transformer based models.
from sentence_transformers import SentenceTransformer

# Import pickle for saving and loading objects.
import pickle

# Import sqlite3 library for storing metadata.
import sqlite3

# Ignore future and deprecated warnings to get cleaner output.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Logs.
import logging

In [2]:
# Download the required nltk packages.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/himanshusharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/himanshusharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/himanshusharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Configuration logs

In [3]:
# Configure logging
logging.basicConfig(
    filename="pipeline.log",         # log file name
    level=logging.INFO,              # logging level (INFO, DEBUG, ERROR)
    format="%(asctime)s - %(levelname)s - %(message)s",  # log format
    filemode="w"                     # overwrite log file each run ("a" to append)
)

logger = logging.getLogger(__name__)

### Define constants

In [4]:
CONFIG_DIRECTORY_PATH = "config"
DATASET_DIRECTORY_PATH = "datasets"
DATA_DIRECTORY_PATH = "data"
MULTILINGUAL_DOCUMENTS_DIRECTORY_PATH = "datasets/multilingual_documents"

### Helping functions

#### 1. Load configuration file.

In [5]:
# Load project specific configuration file.
def load_config(filename):
    config_file_path = f"{CONFIG_DIRECTORY_PATH}/{filename}.yml"
    with open(config_file_path, "r") as f:
        config = yaml.safe_load(f)
    
    # Return config file.
    return config

#### 2. Get Language detection model.

In [6]:
# Get pre-trained language detection model.
def get_langauge_detection_model(language_detection_config):
    model = language_detection_config['model']
    pre_trained_model_filepath = f"{DATA_DIRECTORY_PATH}/{model}"
    if not os.path.exists(pre_trained_model_filepath):
        raise FileNotFoundError(f"{pre_trained_model_filepath} not found. Download it from model's website.")
    else:
        return fasttext.load_model(pre_trained_model_filepath)

### Global variables

In [7]:
# Load configured data.
site_metadata_config = load_config('sites-metadata');
project_config = load_config('project');

# Load fast track model for language detection.
fast_track_language_detection_model = get_langauge_detection_model(project_config['language_detection']);

### 3. Get metadata

In [8]:
def get_metadata():
    return project_config['metadata']

*****

# <b>Text preprocessing pipeline</b>

### Helper functions to preprocess text data

#### 1. Clean text.

In [9]:
# Clean text to remove html formattings, emojis, puntuations and normalize spaces.
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    # Remove html formats.
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove emojis.
    text = emoji.replace_emoji(text, replace="")
    
    # Remove puntuations.
    text = re.sub(r"[^\w\s]", " ", text)
    
    # Remove normalize spaces.
    text = re.sub(r"\s+", " ", text).strip()

    # Remove url links.
    text = re.sub(r"http\S+|www\S+", "", text)

    return text

#### 2. Text normalisation.

In [10]:
# Normalize text.
def normalize_text(text: str) -> str:
    text = text.lower()
    text = unidecode(text)
    return text

#### 3. Tokenization and filter.

In [11]:
# Tokenize texts in sentences and words, and remove stopwords.
def tokenize_and_filter(row, axis = 1):
    text = row['cleaned_text']
    lang = row['language']

    # Sentence tokenize.
    sentences = sent_tokenize(text)

    # Word tokenize
    words = word_tokenize(text)

    # remove stopwords if available for that language.
    if stopwords.has_lang(lang):
        sw = stopwords.stopwords(lang)
        words = [w for w in words if w not in sw]

    return pd.Series({"sentences": sentences, "tokens": words})
    

#### 4. Comibined all processeing steps in a single function.

In [12]:
# ---------------- Text Cleaning ----------------
def process_text(text: str):
    if not isinstance(text, str) or not text.strip():
        return None
    
    text = clean_text(text)
    text = normalize_text(text)
    
    return text

#### 4. Language detection process

In [13]:
# ---------------- Single-Batch Language Detection ----------------
def detect_language_batch(text_batch):
    labels, _ = fast_track_language_detection_model.predict(text_batch, k=1)
    return [lbl[0].replace("__label__", "") if lbl else "unknown" for lbl in labels]

4. Thread based batch language detection process.

In [14]:
# ---------------- Threaded Batch Language Detection ----------------
def batch_detect_language_parallel(texts, batch_size=1000):
    chunks = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
    languages = []

    with Pool() as pool:  # ThreadPool
        for batch_result in tqdm(pool.imap(detect_language_batch, chunks), total=len(chunks), desc="Language Detection"):
            languages.extend(batch_result)
    
    return languages

#### 5. Preprocessing single dataframe.

In [15]:
# ---------------- Preprocess Single DataFrame ----------------
def preprocess_dataframe_parallel(df, metadata):
    """
    Takes a dataframe and a text column, returns new Dataframe with 
    cleaned, normalized, language, tokens.
    """
    initial_count = len(df)
    text_column = metadata['text_column']
    batch_size = metadata['batch_size']
    
    # Remove duplicates
    tqdm.pandas(desc="Removing duplicates....")
    df = df.drop_duplicates(subset=[text_column])
    duplicates_removed = initial_count - len(df)
    
    # Clean text with progress bar
    tqdm.pandas(desc="Cleaning text....")
    df['cleaned_text'] = df[text_column].progress_apply(process_text)

    
    # Remove empty cleaned text
    df = df[df['cleaned_text'].notna()].reset_index(drop=True)
    
    # Detect languages in parallel
    texts = df['cleaned_text'].tolist()
    df['language'] = batch_detect_language_parallel(texts, batch_size=batch_size)

    # Remove unknown languages
    df = df[df['language'] != "unknown"].reset_index(drop=True)

    # tokenization, processin.
    tqdm.pandas(desc="Tokenizing text....")
    df_tokens = df.progress_apply(tokenize_and_filter, axis = 1)
    df = df.join(df_tokens)
    
    # Capture site stats
    site_stats = {
        "total_rows": initial_count,
        "duplicates_removed": duplicates_removed,
        "rows_kept": len(df),
        "languages_detected": df['language'].unique().tolist()
    }
    
    return df, site_stats

*****

# <b> Data preparation pipeline.</b>

### Multiple csv files reading and processing with summary.

In [16]:
# ---------------- Multi-Site CSV Pipeline with Summary ----------------
def process_all_sites_with_summary(metadata):

    # Get list of sites.
    sites = site_metadata_config.get('sites', [])

    # Prepare return variables.
    all_dfs = []
    summary_list = []

    for site_csv in sites:
        csv_directory = f"{MULTILINGUAL_DOCUMENTS_DIRECTORY_PATH}/{site_csv}"
        print(csv_directory)

        # ---------------- Get list of CSV files ----------------
        csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))
        logger.info(f"Found {len(csv_files)} CSV files.")

        # ---------------- Read all CSVs and combine ----------------
        for file in csv_files:
            if os.path.exists(file):
                input_df = pd.read_csv(file)
                df = input_df.copy()
                logger.info(f"[INFO] Processing site: {site_csv} ({len(df)} rows)")

                # Trigger cleaning of dataframes.
                df_cleaned, site_stats = preprocess_dataframe_parallel(df, metadata)
                logger.info(f"[INFO] Done {site_csv}: {site_stats['duplicates_removed']} duplicates removed, {site_stats['rows_kept']} rows kept")
                site_stats["site"] = site_csv
                summary_list.append(site_stats)
                all_dfs.append(df_cleaned)
            else:
                
                logger.info(f"[WARNING] File not found: {file}")
    
    # Merge all cleaned DataFrames
    if all_dfs:
        merged_df = pd.concat(all_dfs, ignore_index=True)
        merged_df = merged_df.drop_duplicates(subset=[metadata['text_column'], 'cleaned_text']).reset_index(drop=True)
        logger.info(f"[INFO] Merged DataFrame contains {len(merged_df)} unique rows after deduplication")
        
        # Save if requested
        save_path = f"{DATA_DIRECTORY_PATH}/{metadata['processed_file_name']}.csv"
        save_format = metadata['data_file_format']
        if save_path:
            if save_format.lower() == "csv":
                merged_df.to_csv(save_path, index=False)
            elif save_format.lower() == "parquet":
                merged_df.to_parquet(save_path, index=False)
            else:
                print(f"[WARNING] Unknown save_format '{save_format}'. Skipping save.")
            logger.info(f"[INFO] Saved merged DataFrame to {save_path}")
        
        # Create summary DataFrame
        summary_df = pd.DataFrame(summary_list)
        print("\n[INFO] Site Summary Table:")
        print(summary_df)
        
        return merged_df, summary_df
    else:
        print("[INFO] No valid data found in any site CSVs.")
        return pd.DataFrame(), pd.DataFrame()
    

In [17]:
# Get the list of sites and candidate languages configurations.
metadata = get_metadata()

# Trigger pipeline.
processed_df, summary_df = process_all_sites_with_summary(metadata)


datasets/multilingual_documents/india


Cleaning text....: 100%|██████████| 511/511 [00:00<00:00, 611.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df[text_column].progress_apply(process_text)
Language Detection: 100%|██████████| 1/1 [00:00<00:00, 12671.61it/s]
Tokenizing text....: 100%|██████████| 511/511 [00:00<00:00, 1030.97it/s]
Cleaning text....: 100%|██████████| 189/189 [00:01<00:00, 143.85it/s]
Language Detection: 100%|██████████| 1/1 [00:00<00:00, 8473.34it/s]
Tokenizing text....: 100%|██████████| 189/189 [00:00<00:00, 399.28it/s]



[INFO] Site Summary Table:
   total_rows  duplicates_removed  rows_kept  \
0         609                  98        511   
1         189                   0        189   

                 languages_detected   site  
0  [en, es, fr, sl, sw, zh, pl, el]  india  
1                          [en, es]  india  


### 1. Data collection

#### Sites
1. Global (https://www.unicef.org/)
2. Armenia (https://www.unicef.org/armenia/)
3. Bangladesh (https://www.unicef.org/bangladesh/)
4. Cambodia (https://www.unicef.org/cambodia/)
5. China (https://www.unicef.org/china/)
6. ECA (https://www.unicef.org/eca/)
7. India (https://www.unicef.org/india/)
8. Myanmar (https://www.unicef.org/myanmar)
9. Peru (https://www.unicef.org/peru/)
10. Vietnam (https://www.unicef.org/vietnam/)

#### Low level Language

#### Collect text data of press releases and articles from all candidate sites.

#### Get CSV data into dataframe and execute pre-processing steps.

1. Read data from csv files.

******

## Load processed and cleaned data.

In [18]:
save_path = f"{DATA_DIRECTORY_PATH}/{project_config['metadata']['processed_file_name']}.csv"
df = pd.read_csv(save_path)

In [20]:
df

Unnamed: 0,title,text,cleaned_text,language,sentences,tokens
0,A new start to life for little Durga,<p>Little Durga pinches her brother Vikram and...,little durga pinches her brother vikram and qu...,en,['little durga pinches her brother vikram and ...,"['durga', 'pinches', 'brother', 'vikram', 'dis..."
1,A new start to life for little Durga,<p>And Manju’s happiness is for a reason. Durg...,and manju s happiness is for a reason durga s ...,en,['and manju s happiness is for a reason durga ...,"['manju', 'happiness', 'reason', 'durga', 'sho..."
2,A new start to life for little Durga,<h4>A new beginning</h4>\r\n\r\n<p>Durga was b...,a new beginning durga was born wih low birth w...,en,['a new beginning durga was born wih low birth...,"['durga', 'born', 'wih', 'birth', 'weight', '2..."
3,A new start to life for little Durga,<p>Close to 39 percent children under five yea...,close to 39 percent children under five years ...,en,['close to 39 percent children under five year...,"['close', 'percent', 'children', 'age', 'malno..."
4,A new start to life for little Durga,<p>Manju made sure that Durga was regular to t...,manju made sure that durga was regular to the ...,en,['manju made sure that durga was regular to th...,"['manju', 'durga', 'regular', 'anganwadi', 'ce..."
...,...,...,...,...,...,...
695,Department of Mass Communication and Journalis...,"<p><strong>HYDERABAD, India, 22 May 2025—Creat...",hyderabad india 22 may 2025 creating safer roa...,en,['hyderabad india 22 may 2025 creating safer r...,"['hyderabad', 'india', '22', '2025', 'creating..."
696,Radio Professionals Come Together To Discuss I...,"<p>NEW DELHI, 3 June 2025 – India’s top radio ...",new delhi 3 june 2025 india s top radio voices...,en,['new delhi 3 june 2025 india s top radio voic...,"['delhi', '3', 'june', '2025', 'india', 'radio..."
697,अंतरराष्ट्रीय खेल दिवस और विश्व पर्यावरण दिवस ...,"<p><strong>नई दिल्ली, 3 जून 2025 –</strong> ऑल...",nii d l l 3 j n 2025 oNl i dd y r dd y n j eph...,es,['nii d l l 3 j n 2025 oNl i dd y r dd y n j e...,"['nii', '2025', 'oNl', 'dd', 'dd', 'ephem', 'n..."
698,GoI and UNICEF: Peer-Support Critical for Adol...,"<p><strong>BHOPAL, 22 July 2025 –</strong> To ...",bhopal 22 july 2025 to strengthen the support ...,en,['bhopal 22 july 2025 to strengthen the suppor...,"['bhopal', '22', 'july', '2025', 'strengthen',..."


In [None]:
summary_df

******

## <b>Model Embedding Pipeline</b>

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoids fork warning

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed

class EmbeddingPipeline:
    def __init__(self, model_name="all-MiniLM-L6-v2", index_dir="./index_store", batch_size=512, n_workers=4):
        self.model = SentenceTransformer(model_name)
        self.index_dir = index_dir
        os.makedirs(index_dir, exist_ok=True)
        self.index = None
        self.batch_size = batch_size
        self.dim = None
        self.texts = []
        self.n_workers = n_workers

    def generate_embeddings_batch(self, texts, save_path=None):
        """Generate embeddings in batches, normalize for cosine similarity, optionally save to disk."""
        all_embeddings = []
        for start in range(0, len(texts), self.batch_size):
            batch_texts = texts[start:start+self.batch_size]
            batch_embeddings = self.model.encode(batch_texts, convert_to_numpy=True)
            faiss.normalize_L2(batch_embeddings)
            all_embeddings.append(batch_embeddings)
            print(f"Processed batch {start}-{start+len(batch_texts)}")
        all_embeddings = np.vstack(all_embeddings)
        print(f"Total embeddings shape: {all_embeddings.shape}")

        if save_path:
            np.save(save_path, all_embeddings)
            print(f"Saved embeddings to {save_path}")

        self.dim = all_embeddings.shape[1]
        self.texts = texts
        return all_embeddings

    def build_index(self, embeddings, index_type="flat", **kwargs):
        """Build FAISS index (flat, hnsw, or ivf) with cosine similarity."""
        if self.dim is None:
            self.dim = embeddings.shape[1]

        if index_type == "flat":
            self.index = faiss.IndexFlatIP(self.dim)

        elif index_type == "hnsw":
            M = kwargs.get("M", 16)
            efConstruction = kwargs.get("efConstruction", 100)
            self.index = faiss.IndexHNSWFlat(self.dim, M, faiss.METRIC_INNER_PRODUCT)
            self.index.hnsw.efConstruction = efConstruction

        elif index_type == "ivf":
            nlist = kwargs.get("nlist", 100)
            quantizer = faiss.IndexFlatIP(self.dim)
            self.index = faiss.IndexIVFFlat(quantizer, self.dim, nlist, faiss.METRIC_INNER_PRODUCT)
            print("Training IVF index...")
            self.index.train(embeddings)

        else:
            raise ValueError("Unsupported index type. Use 'flat', 'hnsw', or 'ivf'.")

        # Add in batches
        for i in range(0, embeddings.shape[0], self.batch_size):
            self.index.add(embeddings[i:i+self.batch_size])

    def save_index(self, name="index.faiss"):
        path = os.path.join(self.index_dir, name)
        faiss.write_index(self.index, path)
        print(f"Index saved at {path}")

    def load_index(self, name="index.faiss"):
        path = os.path.join(self.index_dir, name)
        self.index = faiss.read_index(path)
        print(f"Index loaded from {path}")

    def search(self, query, top_k=5, as_df=True):
        query_embedding = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_embedding)
        D, I = self.index.search(query_embedding, top_k)

        if as_df:
            results = []
            for rank, (score, idx) in enumerate(zip(D[0], I[0]), start=1):
                results.append({
                    "query": query,
                    "rank": rank,
                    "index": int(idx),
                    "cosine_similarity": float(score),
                    "text": self.texts[idx] if self.texts else None
                })
            return pd.DataFrame(results)
        return D, I

    def search_batch(self, queries, top_k=5):
        query_embeddings = self.model.encode(queries, convert_to_numpy=True)
        faiss.normalize_L2(query_embeddings)
        D, I = self.index.search(query_embeddings, top_k)

        all_results = []
        for q_idx, query in enumerate(queries):
            for rank, (score, idx) in enumerate(zip(D[q_idx], I[q_idx]), start=1):
                all_results.append({
                    "query": query,
                    "rank": rank,
                    "index": int(idx),
                    "cosine_similarity": float(score),
                    "text": self.texts[idx] if self.texts else None
                })
        return pd.DataFrame(all_results)

In [None]:
pipeline = EmbeddingPipeline(batch_size=1024, n_workers=8)

In [None]:
sentences = df['cleaned_text']

In [None]:
# Step 1: Generate embeddings in batches
embeddings = pipeline.generate_embeddings_batch(sentences.to_list(), save_path="embeddings_100k.npy")

In [None]:
# Build HNSW index using threads
pipeline.build_index(embeddings, index_type="flat", M=16, efConstruction=100)

In [None]:

# Step 4: Save index
pipeline.save_index("faiss_hnsw_parallel.faiss")

### Load saved vector indexed data.

In [None]:
pipeline = EmbeddingPipeline()
pipeline.load("large_index.faiss", "metadata_large.pkl")

In [None]:
# Step 3: Search batch queries
queries = ["Unicef kids", "उम्मीदों की नई सुबह", "mundo"]
df_results = pipeline.search_batch(queries, top_k=5)

In [None]:
df_results

*****

# Query Processing Pipeline

### Load index for query processing.

In [None]:
# Later reload:
index = faiss.read_index("paraphrase-multilingual-MiniLM-L12-v2__indexes.faiss")

#### Query to indexed data.

In [None]:
# Query FAISS
query = "situation reports in hindi"
query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
D, I = index.search(query_vec, k=3)

## Print results.

In [None]:
# Fetch metadata from SQLite
print("\nSearch results:")
I[0][0]