In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
os.makedirs("/root/.kaggle/", exist_ok=True)

import shutil
shutil.copy("/kaggle/input/kaggle/kaggle.json", "/root/.kaggle/kaggle.json")
os.chmod("/root/.kaggle/kaggle.json", 600)




In [2]:
!kaggle competitions download -c icaif-24-finance-rag-challenge


import zipfile
with zipfile.ZipFile("icaif-24-finance-rag-challenge.zip", "r") as zip_ref:
    zip_ref.extractall("data")

Downloading icaif-24-finance-rag-challenge.zip to /kaggle/working
 99%|█████████████████████████████████████▊| 19.0M/19.1M [00:01<00:00, 19.1MB/s]
100%|██████████████████████████████████████| 19.1M/19.1M [00:01<00:00, 11.4MB/s]


In [3]:
import os
import pandas as pd
folders = [ele for ele in os.listdir("./data") if ele.split(".")[-1] == "jsonl"]

datasets = {folder : pd.read_json("./data/"+folder+"/corpus.jsonl", lines = True) for folder in folders if folder.find("corpus") != -1}
queries = {folder : pd.read_json("./data/"+folder+"/queries.jsonl", lines = True) for folder in folders if folder.find("queries") != -1}

In [4]:
!pip install faiss-gpu
!pip install sentence_transformers
!pip install rank_bm25
!pip install num2words

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.1
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Insta

In [5]:
import os
from typing import List, Dict, Any, Set
import numpy as np
import faiss
import json
from tqdm import tqdm
import torch
import torch.cuda.amp
from sentence_transformers import SentenceTransformer, CrossEncoder
from dataclasses import dataclass
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer

@dataclass
class SearchResult:
    id: str
    title: str
    text: str
    score: float
    keywords: List[str] = None

class EnhancedHybridRAG:
    def __init__(
        self, 
        encoder_model_name: str = 'BAAI/bge-large-en-v1.5',
        reranker_model_name: str = 'BAAI/bge-reranker-large',
        index_path: str = 'faiss_index',
        metadata_path: str = 'document_metadata.json',
        alpha: float = 0.6,
        beta: float = 0.2,
        num_workers: int = 4
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.encoder = SentenceTransformer(encoder_model_name).to(self.device)
        self.reranker = CrossEncoder(reranker_model_name, device=self.device)
        
        self.index_path = index_path
        self.metadata_path = metadata_path
        self.index = None
        self.metadata = []
        self.gpu_index = None
        self.document_texts = {}
        self.alpha = alpha
        self.beta = beta
        self.num_workers = num_workers
        
        # Initialize text processing
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.tfidf = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            max_features=10000
        )
        
        # Initialize search components
        self.bm25 = None
        self.tokenized_corpus = None
        self.keyword_index = {}
        self.document_keywords = {}

    def extract_keywords(self, text: str) -> Set[str]:
        """Extract important keywords from text"""
        # Clean and tokenize
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        words = word_tokenize(text)
        
        # Remove stopwords and short words
        words = [w for w in words if w not in self.stop_words and len(w) > 2]
        
        # Get word frequencies
        word_freq = Counter(words)
        
        # Return top keywords
        return set(dict(word_freq.most_common(10)).keys())

    def preprocess_text(self, text: str) -> List[str]:
        """Enhanced text preprocessing"""
        # Clean text
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords and short words
        tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
        return tokens

    def encode_documents(self, documents: List[Dict[str, Any]], batch_size: int = 32):
        print("Encoding documents...")
        dimension = self.encoder.get_sentence_embedding_dimension()

        # Build TF-IDF index
        print("Building TF-IDF index...")
        doc_texts = [doc['text'] for doc in documents]
        self.tfidf.fit(doc_texts)

        # Extract keywords for all documents
        print("Extracting keywords...")
        for doc in tqdm(documents, desc="Processing keywords"):
            keywords = self.extract_keywords(doc['text'])
            self.document_keywords[doc['id']] = list(keywords)

            # Build keyword index
            for keyword in keywords:
                if keyword not in self.keyword_index:
                    self.keyword_index[keyword] = set()
                self.keyword_index[keyword].add(doc['id'])

        # Initialize FAISS index
        n_docs = len(documents)
        n_clusters = min(int(np.sqrt(n_docs)), max(1, n_docs // 4))

        # Create CPU index first
        if n_docs < 1000:
            self.index = faiss.IndexFlatL2(dimension)
        else:
            quantizer = faiss.IndexFlatL2(dimension)
            self.index = faiss.IndexIVFFlat(quantizer, dimension, n_clusters)

        # Process documents first to get vectors for training
        print("Processing documents...")
        train_vectors = []
        tokenized_corpus = []

        for doc in tqdm(documents, desc="Processing documents"):
            text = f"passage: {doc['text']}"
            tokenized_corpus.append(self.preprocess_text(doc['text']))

            with torch.no_grad():
                embedding = self.encoder.encode(text, convert_to_tensor=True).cpu().numpy()
            train_vectors.append(embedding)
            self.document_texts[doc['id']] = doc['text']

        train_vectors = np.array(train_vectors).astype('float32')

        # Initialize GPU resources and transfer index
        if torch.cuda.is_available():
            print("Moving index to GPU...")
            res = faiss.StandardGpuResources()

            if isinstance(self.index, faiss.IndexFlatL2):
                self.gpu_index = faiss.GpuIndexFlatL2(res, dimension)
            else:
                config = faiss.GpuIndexIVFFlatConfig()
                config.useFloat16 = True
                self.gpu_index = faiss.GpuIndexIVFFlat(
                    res, 
                    dimension,
                    n_clusters,
                    faiss.METRIC_L2
                )
                # Train IVF index
                print("Training IVF index...")
                self.gpu_index.train(train_vectors)
        else:
            print("No GPU available, using CPU index...")
            self.gpu_index = self.index
            if isinstance(self.index, faiss.IndexIVFFlat):
                self.index.train(train_vectors)

        # Initialize BM25
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.tokenized_corpus = tokenized_corpus

        # Batch add vectors to index
        print("Adding vectors to index...")
        for i in tqdm(range(0, len(documents), batch_size), desc="Indexing"):
            batch = documents[i:i+batch_size]
            texts = [f"passage: {doc['text']}" for doc in batch]

            with torch.no_grad():
                embeddings = self.encoder.encode(texts, convert_to_tensor=True).cpu().numpy()

            self.gpu_index.add(embeddings)

            for doc in batch:
                self.metadata.append({
                    'id': doc['id'],
                    'title': doc['title'],
                    'text': doc['text'],
                    'keywords': list(self.document_keywords[doc['id']])
                })

        # Save index and metadata
        print("Saving index and metadata...")
        if torch.cuda.is_available():
            self.index = faiss.index_gpu_to_cpu(self.gpu_index)
        else:
            self.index = self.gpu_index

        faiss.write_index(self.index, self.index_path)

        # Save metadata with proper serialization
        serializable_corpus = [list(tokens) for tokens in self.tokenized_corpus]
        with open(self.metadata_path, 'w') as f:
            json.dump({
                'metadata': self.metadata,
                'document_texts': self.document_texts,
                'tokenized_corpus': serializable_corpus,
                'document_keywords': self.document_keywords,
                'keyword_index': {k: list(v) for k, v in self.keyword_index.items()}
            }, f)

        print("Indexing completed successfully.")
            
    def load_index(self):
        if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
            self.index = faiss.read_index(self.index_path)
            
            if torch.cuda.is_available():
                res = faiss.StandardGpuResources()
                if isinstance(self.index, faiss.IndexFlatL2):
                    self.gpu_index = faiss.GpuIndexFlatL2(res, self.index.d)
                    self.gpu_index.copyFrom(self.index)
                elif isinstance(self.index, faiss.IndexIVFFlat):
                    config = faiss.GpuIndexIVFFlatConfig()
                    config.useFloat16 = True
                    self.gpu_index = faiss.GpuIndexIVFFlat(
                        res, 
                        self.index.d,
                        self.index.nlist,
                        faiss.METRIC_L2
                    )
                    self.gpu_index.copyFrom(self.index)
            else:
                self.gpu_index = self.index
            
            with open(self.metadata_path, 'r') as f:
                data = json.load(f)
                self.metadata = data['metadata']
                self.document_texts = data['document_texts']
                self.tokenized_corpus = [list(tokens) for tokens in data['tokenized_corpus']]
                self.document_keywords = data['document_keywords']
                self.keyword_index = {k: set(v) for k, v in data['keyword_index'].items()}
                self.bm25 = BM25Okapi(self.tokenized_corpus)
            
            print(f"Loaded index with {self.gpu_index.ntotal} vectors.")
        else:
            print("Index files not found. Please encode documents first.")

    def get_keyword_matches(self, query: str, doc_id: str) -> float:
        """Calculate keyword matching score"""
        query_keywords = self.extract_keywords(query)
        doc_keywords = set(self.document_keywords.get(doc_id, []))
        if not doc_keywords:
            return 0.0
        
        matches = len(query_keywords.intersection(doc_keywords))
        return matches / len(query_keywords) if query_keywords else 0.0

    def batch_search(
        self, 
        queries: List[str], 
        k: int = 10,
        rerank_k: int = 25,
        rerank_batch_size: int = 64
    ) -> List[List[SearchResult]]:
        if self.gpu_index is None:
            self.load_index()

        all_results = []

        # Optimize GPU memory for P100
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.memory.set_per_process_memory_fraction(0.9)
            torch.backends.cudnn.benchmark = True
            torch.backends.cuda.matmul.allow_tf32 = True

        for query_idx, query in enumerate(queries, 1):
            print(f"\nProcessing query {query_idx}/{len(queries)}: {query}")

            query_text = f"query: {query}"

            with torch.no_grad():
                query_vector = self.encoder.encode(
                    query_text,
                    convert_to_tensor=True,
                    batch_size=1,
                    show_progress_bar=False
                ).cpu().numpy().astype('float32').reshape(1, -1)

                query_tokens = self.preprocess_text(query)
                bm25_scores = self.bm25.get_scores(query_tokens)
                max_bm25 = max(bm25_scores) + 1e-6
                query_keywords = self.extract_keywords(query)

            if isinstance(self.index, faiss.IndexIVFFlat):
                self.gpu_index.nprobe = min(8, self.gpu_index.nlist)
            dense_distances, dense_indices = self.gpu_index.search(query_vector, rerank_k)

            results = []
            seen_ids = set()

            for idx, distance in zip(dense_indices[0], dense_distances[0]):
                if idx != -1 and idx < len(self.metadata):
                    doc_id = self.metadata[idx]['id']
                    if doc_id not in seen_ids:
                        seen_ids.add(doc_id)

                        # Get document text from metadata
                        doc_text = self.metadata[idx]['text']  # Fixed: Get text from metadata

                        dense_score = 1 / (1 + distance)
                        sparse_score = bm25_scores[idx] / max_bm25
                        keyword_score = self.get_keyword_matches(query, doc_id)

                        combined_score = (
                            self.alpha * dense_score + 
                            (1 - self.alpha - self.beta) * sparse_score +
                            self.beta * keyword_score
                        )

                        results.append({
                            'id': doc_id,
                            'title': self.metadata[idx]['title'],
                            'text': doc_text,  # Now doc_text is defined
                            'score': combined_score,
                            'keywords': self.document_keywords[doc_id]
                        })

            if results:
                results = sorted(results, key=lambda x: x['score'], reverse=True)[:rerank_k]
                rerank_pairs = [(query, res['text']) for res in results]

                with torch.cuda.amp.autocast():
                    with torch.no_grad():
                        rerank_scores = self.reranker.predict(
                            rerank_pairs,
                            batch_size=rerank_batch_size,
                            show_progress_bar=False
                        )

                final_results = [
                    SearchResult(
                        id=res['id'],
                        title=res['title'],
                        text=res['text'],
                        score=float(score),
                        keywords=res['keywords']
                    )
                    for res, score in zip(results, rerank_scores)
                ]

                final_results = sorted(final_results, key=lambda x: x.score, reverse=True)[:k]
                all_results.append(final_results)
                print(f"Found {len(final_results)} results")
            else:
                all_results.append([])

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return all_results
    


    
documents = [
    {'id': str(1), 'title': f'Document my name', 'text': 'My Name is rahul verma'},
    {'id': str(2), 'title': f'Document current city', 'text': 'I currently live in chandigarh punjab'},
    {'id': str(3), 'title': f'Document pet name', 'text': 'I have a cat name rishi'},
    {'id': str(110), 'title': f'financial document', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material. Write-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019. The average recorded investment of impaired leases and loans for Americas, EMEA and Asia Pacific was $138 million, $49 million and $45 million, respectively, for the year ended December 31, 2019. Both interest income recognized, and interest income recognized on a cash basis on impaired leases and loans were immaterial for the year ended December 31, 2019. ($ in millions) | | | | ---------------------------------------------------------- | -------- | ------ | ------------ | -------- At December 31, 2019: | Americas | EMEA | Asia Pacific | Total Recorded investment: | | | | Lease receivables | $ 3,419 | $1,186 | $ 963 | $ 5,567 Loan receivables | 6,726 | 3,901 | 2,395 | 13,022 Ending balance | $10,144 | $5,087 | $3,359 | $18,590 Recorded investment, collectively evaluated for impairment | $10,032 | $5,040 | $3,326 | $18,399 Recorded investment, individually evaluated for impairment | $ 112 | $ 47 | $ 32 | $ 191 Allowance for credit losses | | | | Beginning balance at January 1, 2019 | | | | Lease receivables | $ 53 | $ 22 | $ 24 | $ 99 Loan receivables | 105 | 43 | 32 | 179 Total | $ 158 | $ 65 | $ 56 | $ 279 Write-offs | (42) | (3) | (18) | (63) Recoveries | 1 | 0 | 1 | 2 Provision | 5 | (7) | (3) | (5) Other* | (1) | 0 | (1) | (2) Ending balance at December 31, 2019 | $ 120 | $ 54 | $ 36 | $ 210 Lease receivables | $ 33 | $ 23 | $ 16 | $ 72 Loan receivables | $ 88 | $ 31 | $ 20 | $ 138 Related allowance, collectively evaluated for impairment | $ 25 | $ 11 | $ 4 | $ 39 Related allowance, individually evaluated for impairment | $ 96 | $ 43 | $ 32 | $ 171'}
]

documents += [{'id': str(i), 'title': f'Document random {i}', 'text': f'This is a random document {i}'} for i in range(1, 100)]

rag = FinancialEnhancedRAG()


# # Encode and index documents
rag.encode_documents(documents)

# # Later, when you want to search:
# query = "What is the name of Rahul's pet"
# results = rag.search(query, k=1)

# print("Top relevant document:")
# for result in results:
#     print(f"ID: {result['id']}, Title: {result['title']}, Score: {result['score']:.4f}")

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Encoding documents...
Building TF-IDF index...
Extracting keywords...


Processing keywords: 100%|██████████| 103/103 [00:00<00:00, 3498.82it/s]


Processing documents...


Processing documents:   0%|          | 0/103 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 1/103 [00:00<01:08,  1.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 3/103 [00:00<00:21,  4.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 5/103 [00:00<00:14,  6.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 7/103 [00:01<00:10,  9.47it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▊         | 9/103 [00:01<00:07, 11.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 11/103 [00:01<00:06, 13.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 13/103 [00:01<00:06, 14.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▍        | 15/103 [00:01<00:05, 15.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 17/103 [00:01<00:05, 16.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 19/103 [00:01<00:04, 16.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|██        | 21/103 [00:01<00:04, 17.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 24/103 [00:01<00:04, 19.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 27/103 [00:02<00:03, 21.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 30/103 [00:02<00:03, 22.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  32%|███▏      | 33/103 [00:02<00:03, 23.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  35%|███▍      | 36/103 [00:02<00:02, 23.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  38%|███▊      | 39/103 [00:02<00:02, 23.25it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  41%|████      | 42/103 [00:02<00:02, 23.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  44%|████▎     | 45/103 [00:02<00:02, 24.07it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  47%|████▋     | 48/103 [00:02<00:02, 23.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  50%|████▉     | 51/103 [00:03<00:02, 23.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  52%|█████▏    | 54/103 [00:03<00:02, 23.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  55%|█████▌    | 57/103 [00:03<00:01, 23.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  58%|█████▊    | 60/103 [00:03<00:01, 23.51it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  61%|██████    | 63/103 [00:03<00:01, 23.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  64%|██████▍   | 66/103 [00:03<00:01, 23.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  67%|██████▋   | 69/103 [00:03<00:01, 23.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  70%|██████▉   | 72/103 [00:03<00:01, 23.27it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  73%|███████▎  | 75/103 [00:04<00:01, 22.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  76%|███████▌  | 78/103 [00:04<00:01, 23.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  79%|███████▊  | 81/103 [00:04<00:00, 22.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  82%|████████▏ | 84/103 [00:04<00:00, 22.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  84%|████████▍ | 87/103 [00:04<00:00, 22.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  87%|████████▋ | 90/103 [00:04<00:00, 21.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  90%|█████████ | 93/103 [00:04<00:00, 21.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  93%|█████████▎| 96/103 [00:05<00:00, 21.70it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  96%|█████████▌| 99/103 [00:05<00:00, 22.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  99%|█████████▉| 102/103 [00:05<00:00, 23.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents: 100%|██████████| 103/103 [00:05<00:00, 19.39it/s]


Moving index to GPU...
Adding vectors to index...


Indexing:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Indexing:  25%|██▌       | 1/4 [00:01<00:05,  1.74s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Indexing:  75%|███████▌  | 3/4 [00:01<00:00,  2.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Indexing: 100%|██████████| 4/4 [00:01<00:00,  2.09it/s]

Saving index and metadata...
Indexing completed successfully.





In [8]:
query = "What is Rahul's pet"
rag.batch_search(['What were the write offs in 2019', query, "What is Rahul's city", "Was there a name mentioned?"]
            )


Processing query 1/4: What were the write offs in 2019


  with torch.cuda.amp.autocast():


Found 25 results

Processing query 2/4: What is Rahul's pet
Found 25 results

Processing query 3/4: What is Rahul's city
Found 25 results

Processing query 4/4: Was there a name mentioned?
Found 25 results


[[], [], [], []]

In [27]:
def dataset_processor(documents, queries):
    documents = [
        {'id': row["_id"], 'title': row["title"], 'text': row["title"]+". "+row["text"]} for _, row in documents.iterrows()
    ]
    rag = FinancialEnhancedRAG()
    rag.encode_documents(documents)
    
    
    result = {"query_id":[], "corpus_id": []}
    output = rag.batch_search([query["text"] for _, query in queries.iterrows()])
    for _output, query in zip(output, queries.iterrows()):
        for _result in _output:
            result["query_id"].append(query[1]["_id"])
            result["corpus_id"].append(_result.id)
    del rag
    df = pd.DataFrame(result)
    return df 
        
    

In [17]:
datasets.keys()

dict_keys(['finder_corpus.jsonl', 'finqabench_corpus.jsonl', 'multiheirtt_corpus.jsonl', 'convfinqa_corpus.jsonl', 'financebench_corpus.jsonl', 'finqa_corpus.jsonl', 'tatqa_corpus.jsonl'])

In [28]:
for key in datasets:
    if key not in ['finder_corpus.jsonl', 'financebench_corpus.jsonl', 'finqabench_corpus.jsonl']:
        query_file = key.split("_")[0] + "_queries.jsonl"
        output_file = key.split("_")[0]+ ".tsv"
        output = dataset_processor(datasets[key], queries[query_file])
        output.to_csv(output_file, sep = "\t")    


Encoding documents...
Building TF-IDF index...
Extracting keywords...


Processing keywords: 100%|██████████| 2066/2066 [00:12<00:00, 169.50it/s]


Processing documents...


Processing documents:   0%|          | 0/2066 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 1/2066 [00:00<03:36,  9.56it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 3/2066 [00:00<03:23, 10.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 5/2066 [00:00<03:19, 10.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 7/2066 [00:00<03:11, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 9/2066 [00:00<03:12, 10.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 11/2066 [00:01<03:08, 10.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 13/2066 [00:01<03:08, 10.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 15/2066 [00:01<03:10, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 17/2066 [00:01<03:08, 10.86it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 19/2066 [00:01<03:10, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 21/2066 [00:01<03:11, 10.70it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 23/2066 [00:02<03:09, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|          | 25/2066 [00:02<03:12, 10.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|▏         | 27/2066 [00:02<03:10, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   1%|▏         | 29/2066 [00:02<03:13, 10.54it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 31/2066 [00:02<03:07, 10.86it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 33/2066 [00:03<03:06, 10.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 35/2066 [00:03<03:09, 10.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 37/2066 [00:03<03:09, 10.70it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 39/2066 [00:03<03:08, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 41/2066 [00:03<03:07, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 43/2066 [00:04<03:07, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 45/2066 [00:04<03:06, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 47/2066 [00:04<03:05, 10.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 49/2066 [00:04<03:08, 10.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   2%|▏         | 51/2066 [00:04<03:08, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 53/2066 [00:04<03:09, 10.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 55/2066 [00:05<03:00, 11.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 57/2066 [00:05<03:04, 10.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 59/2066 [00:05<03:05, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 61/2066 [00:05<03:06, 10.73it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 63/2066 [00:05<03:07, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 65/2066 [00:06<03:04, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 67/2066 [00:06<03:05, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 69/2066 [00:06<03:03, 10.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   3%|▎         | 71/2066 [00:06<03:02, 10.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▎         | 73/2066 [00:06<02:56, 11.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▎         | 75/2066 [00:06<03:00, 11.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▎         | 77/2066 [00:07<03:02, 10.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 79/2066 [00:07<03:03, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 81/2066 [00:07<03:05, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 83/2066 [00:07<03:03, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 85/2066 [00:07<03:02, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 87/2066 [00:08<03:02, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 89/2066 [00:08<03:03, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   4%|▍         | 91/2066 [00:08<03:03, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 93/2066 [00:08<03:03, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 95/2066 [00:08<03:03, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 97/2066 [00:08<03:02, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 99/2066 [00:09<03:04, 10.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 101/2066 [00:09<03:06, 10.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▍         | 103/2066 [00:09<03:05, 10.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▌         | 105/2066 [00:09<03:05, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▌         | 107/2066 [00:09<03:03, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▌         | 109/2066 [00:10<03:03, 10.67it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▌         | 111/2066 [00:10<03:03, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   5%|▌         | 113/2066 [00:10<03:03, 10.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 115/2066 [00:10<03:02, 10.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 117/2066 [00:10<03:02, 10.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 119/2066 [00:11<03:03, 10.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 121/2066 [00:11<03:01, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 123/2066 [00:11<03:02, 10.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 125/2066 [00:11<03:00, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 127/2066 [00:11<03:02, 10.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▌         | 129/2066 [00:12<03:01, 10.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▋         | 131/2066 [00:12<03:02, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   6%|▋         | 133/2066 [00:12<03:01, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 135/2066 [00:12<03:00, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 137/2066 [00:12<03:02, 10.57it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 139/2066 [00:12<03:01, 10.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 141/2066 [00:13<03:00, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 143/2066 [00:13<03:01, 10.57it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 145/2066 [00:13<02:59, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 147/2066 [00:13<02:57, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 149/2066 [00:13<02:49, 11.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 151/2066 [00:14<02:53, 11.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   7%|▋         | 153/2066 [00:14<02:54, 10.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 155/2066 [00:14<02:59, 10.65it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 157/2066 [00:14<02:46, 11.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 159/2066 [00:14<02:48, 11.33it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 161/2066 [00:14<02:51, 11.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 163/2066 [00:15<02:50, 11.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 165/2066 [00:15<02:51, 11.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 167/2066 [00:15<02:52, 11.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 169/2066 [00:15<02:53, 10.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 171/2066 [00:15<02:53, 10.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 173/2066 [00:16<02:49, 11.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   8%|▊         | 175/2066 [00:16<02:49, 11.17it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▊         | 177/2066 [00:16<02:50, 11.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▊         | 179/2066 [00:16<02:53, 10.90it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 181/2066 [00:16<02:45, 11.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 183/2066 [00:16<02:47, 11.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 185/2066 [00:17<02:50, 11.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 187/2066 [00:17<02:54, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 189/2066 [00:17<02:55, 10.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 191/2066 [00:17<02:52, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 193/2066 [00:17<02:51, 10.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   9%|▉         | 195/2066 [00:18<02:53, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|▉         | 197/2066 [00:18<02:46, 11.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|▉         | 199/2066 [00:18<02:41, 11.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|▉         | 201/2066 [00:18<02:44, 11.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|▉         | 203/2066 [00:18<02:44, 11.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|▉         | 205/2066 [00:18<02:46, 11.15it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|█         | 207/2066 [00:19<02:51, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|█         | 209/2066 [00:19<02:51, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|█         | 211/2066 [00:19<02:53, 10.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|█         | 213/2066 [00:19<02:51, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  10%|█         | 215/2066 [00:19<02:51, 10.82it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 217/2066 [00:20<02:51, 10.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 219/2066 [00:20<02:52, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 221/2066 [00:20<02:53, 10.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 223/2066 [00:20<02:53, 10.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 225/2066 [00:20<02:56, 10.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 227/2066 [00:20<02:52, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 229/2066 [00:21<02:46, 11.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█         | 231/2066 [00:21<02:47, 10.97it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█▏        | 233/2066 [00:21<02:47, 10.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█▏        | 235/2066 [00:21<02:46, 10.97it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  11%|█▏        | 237/2066 [00:21<02:58, 10.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 239/2066 [00:22<02:55, 10.40it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 241/2066 [00:22<02:54, 10.46it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 243/2066 [00:22<02:54, 10.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 245/2066 [00:22<02:54, 10.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 247/2066 [00:22<02:48, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 249/2066 [00:23<02:47, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 251/2066 [00:23<02:40, 11.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 253/2066 [00:23<02:46, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 255/2066 [00:23<02:46, 10.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  12%|█▏        | 257/2066 [00:23<02:45, 10.90it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 259/2066 [00:23<02:47, 10.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 261/2066 [00:24<02:44, 10.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 263/2066 [00:24<02:44, 10.95it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 265/2066 [00:24<02:45, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 267/2066 [00:24<02:45, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 269/2066 [00:24<02:48, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 271/2066 [00:25<02:46, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 273/2066 [00:25<02:51, 10.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 275/2066 [00:25<02:49, 10.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  13%|█▎        | 277/2066 [00:25<02:49, 10.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▎        | 279/2066 [00:25<02:48, 10.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▎        | 281/2066 [00:25<02:41, 11.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▎        | 283/2066 [00:26<02:43, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 285/2066 [00:26<02:42, 10.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 287/2066 [00:26<02:42, 10.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 289/2066 [00:26<02:45, 10.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 291/2066 [00:26<02:46, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 293/2066 [00:27<02:44, 10.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 295/2066 [00:27<02:43, 10.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 297/2066 [00:27<02:44, 10.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  14%|█▍        | 299/2066 [00:27<02:48, 10.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▍        | 301/2066 [00:27<02:47, 10.51it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▍        | 303/2066 [00:28<02:47, 10.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▍        | 305/2066 [00:28<02:46, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▍        | 307/2066 [00:28<02:46, 10.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▍        | 309/2066 [00:28<02:46, 10.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▌        | 311/2066 [00:28<02:46, 10.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▌        | 313/2066 [00:29<02:46, 10.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▌        | 315/2066 [00:29<02:44, 10.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▌        | 317/2066 [00:29<02:45, 10.57it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  15%|█▌        | 319/2066 [00:29<02:42, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 321/2066 [00:29<02:40, 10.86it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 323/2066 [00:29<02:37, 11.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 325/2066 [00:30<02:37, 11.07it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 327/2066 [00:30<02:40, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 329/2066 [00:30<02:40, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 331/2066 [00:31<07:03,  4.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 333/2066 [00:31<05:45,  5.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▌        | 335/2066 [00:32<04:51,  5.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▋        | 337/2066 [00:32<04:12,  6.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  16%|█▋        | 339/2066 [00:32<03:43,  7.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 341/2066 [00:32<03:24,  8.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 343/2066 [00:32<03:11,  8.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 345/2066 [00:32<03:02,  9.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 347/2066 [00:33<02:52,  9.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 349/2066 [00:33<02:47, 10.27it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 351/2066 [00:33<02:40, 10.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 353/2066 [00:33<02:40, 10.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 355/2066 [00:33<02:38, 10.82it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 357/2066 [00:34<02:37, 10.82it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 359/2066 [00:34<02:38, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  17%|█▋        | 361/2066 [00:34<02:33, 11.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 363/2066 [00:34<02:34, 10.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 365/2066 [00:34<02:36, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 367/2066 [00:34<02:37, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 369/2066 [00:35<02:37, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 371/2066 [00:35<02:37, 10.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 373/2066 [00:35<02:34, 10.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 375/2066 [00:35<02:33, 10.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 377/2066 [00:35<02:35, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 379/2066 [00:36<02:39, 10.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  18%|█▊        | 381/2066 [00:36<02:38, 10.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▊        | 383/2066 [00:36<02:38, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▊        | 385/2066 [00:36<02:36, 10.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▊        | 387/2066 [00:36<02:35, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 389/2066 [00:37<02:34, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 391/2066 [00:37<02:35, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 393/2066 [00:37<02:35, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 395/2066 [00:37<02:34, 10.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 397/2066 [00:37<02:34, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 399/2066 [00:37<02:32, 10.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  19%|█▉        | 401/2066 [00:38<02:28, 11.24it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|█▉        | 403/2066 [00:38<02:30, 11.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|█▉        | 405/2066 [00:38<02:31, 10.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|█▉        | 407/2066 [00:38<02:33, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|█▉        | 409/2066 [00:38<02:32, 10.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|█▉        | 411/2066 [00:39<02:32, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|█▉        | 413/2066 [00:39<02:29, 11.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|██        | 415/2066 [00:39<02:25, 11.36it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|██        | 417/2066 [00:39<02:28, 11.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|██        | 419/2066 [00:39<02:32, 10.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|██        | 421/2066 [00:39<02:34, 10.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  20%|██        | 423/2066 [00:40<02:32, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 425/2066 [00:40<02:31, 10.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 427/2066 [00:40<02:33, 10.70it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 429/2066 [00:40<02:33, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 431/2066 [00:40<02:24, 11.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 433/2066 [00:41<02:29, 10.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 435/2066 [00:41<02:28, 10.97it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 437/2066 [00:41<02:28, 10.95it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██        | 439/2066 [00:41<02:30, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██▏       | 441/2066 [00:41<02:30, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  21%|██▏       | 443/2066 [00:41<02:32, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 445/2066 [00:42<02:26, 11.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 447/2066 [00:42<02:30, 10.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 449/2066 [00:42<02:33, 10.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 451/2066 [00:42<02:34, 10.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 453/2066 [00:42<02:20, 11.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 455/2066 [00:43<02:23, 11.19it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 457/2066 [00:43<02:19, 11.51it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 459/2066 [00:43<02:27, 10.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 461/2066 [00:43<02:26, 10.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  22%|██▏       | 463/2066 [00:43<02:26, 10.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 465/2066 [00:43<02:28, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 467/2066 [00:44<02:27, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 469/2066 [00:44<02:26, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 471/2066 [00:44<02:20, 11.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 473/2066 [00:44<02:22, 11.18it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 475/2066 [00:44<02:18, 11.47it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 477/2066 [00:45<02:15, 11.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 479/2066 [00:45<02:21, 11.22it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 481/2066 [00:45<02:24, 10.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 483/2066 [00:45<02:24, 10.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  23%|██▎       | 485/2066 [00:45<02:23, 11.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▎       | 487/2066 [00:45<02:24, 10.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▎       | 489/2066 [00:46<02:27, 10.67it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 491/2066 [00:46<02:27, 10.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 493/2066 [00:46<02:26, 10.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 495/2066 [00:46<02:26, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 497/2066 [00:46<02:26, 10.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 499/2066 [00:47<02:25, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 501/2066 [00:47<02:25, 10.77it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 503/2066 [00:47<02:24, 10.82it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  24%|██▍       | 505/2066 [00:47<02:25, 10.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▍       | 507/2066 [00:47<02:28, 10.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▍       | 509/2066 [00:48<02:28, 10.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▍       | 511/2066 [00:48<02:26, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▍       | 513/2066 [00:48<02:23, 10.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▍       | 515/2066 [00:48<02:25, 10.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▌       | 517/2066 [00:48<02:26, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▌       | 519/2066 [00:48<02:25, 10.61it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▌       | 521/2066 [00:49<02:26, 10.56it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▌       | 523/2066 [00:49<02:25, 10.57it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  25%|██▌       | 525/2066 [00:49<02:24, 10.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 527/2066 [00:49<02:25, 10.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 529/2066 [00:49<02:25, 10.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 531/2066 [00:50<02:24, 10.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 533/2066 [00:50<02:23, 10.65it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 535/2066 [00:50<02:22, 10.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 537/2066 [00:50<02:23, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 539/2066 [00:50<02:22, 10.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▌       | 541/2066 [00:51<02:17, 11.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▋       | 543/2066 [00:51<02:20, 10.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▋       | 545/2066 [00:51<02:20, 10.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  26%|██▋       | 547/2066 [00:51<02:20, 10.84it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 549/2066 [00:51<02:19, 10.86it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 551/2066 [00:51<02:18, 10.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 553/2066 [00:52<02:18, 10.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 555/2066 [00:52<02:15, 11.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 557/2066 [00:52<02:17, 10.97it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 559/2066 [00:52<02:19, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 561/2066 [00:52<02:18, 10.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 563/2066 [00:53<02:14, 11.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 565/2066 [00:53<02:16, 10.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  27%|██▋       | 567/2066 [00:53<02:18, 10.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 569/2066 [00:53<02:18, 10.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 571/2066 [00:53<02:22, 10.48it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 573/2066 [00:53<02:21, 10.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 575/2066 [00:54<02:22, 10.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 577/2066 [00:54<02:21, 10.54it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 579/2066 [00:54<02:18, 10.73it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 581/2066 [00:54<02:19, 10.67it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 583/2066 [00:54<02:19, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 585/2066 [00:55<02:19, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  28%|██▊       | 587/2066 [00:55<02:21, 10.47it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▊       | 589/2066 [00:55<02:18, 10.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▊       | 591/2066 [00:55<02:19, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▊       | 593/2066 [00:55<02:19, 10.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 595/2066 [00:56<02:19, 10.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 597/2066 [00:56<02:16, 10.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 599/2066 [00:56<02:18, 10.56it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 601/2066 [00:56<02:17, 10.64it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 603/2066 [00:56<02:17, 10.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 605/2066 [00:56<02:16, 10.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 607/2066 [00:57<02:11, 11.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  29%|██▉       | 609/2066 [00:57<02:11, 11.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|██▉       | 611/2066 [00:57<02:12, 11.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|██▉       | 613/2066 [00:57<02:13, 10.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|██▉       | 615/2066 [00:57<02:13, 10.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|██▉       | 617/2066 [00:58<02:14, 10.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|██▉       | 619/2066 [00:58<02:05, 11.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|███       | 621/2066 [00:58<02:09, 11.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|███       | 623/2066 [00:58<02:10, 11.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|███       | 625/2066 [00:58<02:11, 10.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|███       | 627/2066 [00:58<02:13, 10.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:  30%|███       | 627/2066 [00:59<02:15, 10.61it/s]


KeyboardInterrupt: 

In [2]:
import pandas as pd
import glob

# Path to the folder containing the TSV files to merge
folder_path = '/kaggle/input/new-results'

# Get all the TSV file paths in the folder
tsv_files = glob.glob(f"{folder_path}/*.tsv")

# Create an empty list to store dataframes
dataframes = []

# Read the first file with the header, and the rest without headers
for i, file in enumerate(tsv_files):
    if i == 0:
        df = pd.read_csv(file, sep='\t')  # Read the first file with the header
    else:
        df = pd.read_csv(file, sep='\t', header=None)  # Read the rest without the header
        df.columns = dataframes[0].columns  # Use the header from the first file
    dataframes.append(df)

# Concatenate all the dataframes
merged_df = pd.concat(dataframes, ignore_index=True)

# Save the merged dataframe to a new TSV file
#merged_df.to_csv('merged_output.csv', index=False)

In [3]:
merged_df = merged_df[["query_id", "corpus_id"]]

In [4]:
merged_df.to_csv('merged_output_reiteration.csv', index=False)

In [21]:
!kaggle competitions submit -c icaif-24-finance-rag-challenge -f merged_output_reiteration.csv -m "new results"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|████████████████████████████████████████| 911k/911k [00:00<00:00, 1.50MB/s]
Successfully submitted to ACM-ICAIF '24 FinanceRAG Challenge