Copyright (C) 2024 Konstantin Touev  
All Rights Reserved.

TODO: cite dataset: https://huggingface.co/datasets/pierreguillou/DocLayNet-small

In [89]:
from django.contrib.admin import display

if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()

from typing import List
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from file_processing.llm_chat_support import get_llm, LLMTemp, LLMTypes
from pydantic import BaseModel, Field


# Define LLM-globals
class ConflictCountFormat(BaseModel):
    conflict_instances: List[str] = Field(description="List of conflicting information instances found in the text.")

structured_model = (get_llm(LLMTemp.CONCRETE, LLMTypes.BIG_VISUAL_MODEL)
                    .with_structured_output(ConflictCountFormat, method="json_mode"))

output_parser = PydanticOutputParser(pydantic_object=ConflictCountFormat)

count_conflicts = PromptTemplate(
    input_variables=['text'],
    partial_variables={'format_instructions': output_parser.get_format_instructions()},
    template="""
    You are tasked with identifying conflicting information in the following text.

    Conflicting information refers to:
    - Contradictory statements, facts, or data that disagree with each other.
    - Information that cannot logically coexist or causes confusion due to inconsistency.
    
    Provide all instances of such conflicts as a list of arguments. Each argument should succinctly summarize the conflict.

    Text:
    {text}

    {format_instructions}
    """
)


In [19]:
import pandas as pd
from datasets import load_dataset

ds = load_dataset("pierreguillou/DocLayNet-small", trust_remote_code=True)
dataset_df = pd.concat([
    ds['train'].to_pandas(),
    ds['validation'].to_pandas(),
    ds['test'].to_pandas()
])

Ok, the plan is to consolidate the dataset splits and consolidate the files into a list.  
Then sample and test if conflicts are across files or are internal to files.

In [72]:
agg_function = {
    'texts': lambda x: ' '.join([' '.join(page) for page in x]),
    'collection': 'first',
    'doc_category': 'first'
}

result = (
    dataset_df.sort_values(by=['original_filename', 'page_no'])
    .groupby('original_filename')
    .agg(agg_function)
)
display(result)

Unnamed: 0_level_0,texts,collection,doc_category
original_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00-80T-80.pdf,NAVWEPS 00-8OT-80 HIGH SPEED AERODYNAMICS ai...,faa_regulations,laws_and_regulations
0000003545-19-000143_20191205x10k.pdf,"Note 5. Property and Equipment, Net Property a...",sec_filings,financial_reports
0000004904-20-000007_20200220x10k.pdf,SYSTEM TRANSMISSION LINES AND FACILITY SITING ...,sec_filings,financial_reports
0000004904-21-000010_20210225x10k.pdf,Transource Energy Desert Sky Wind Farm LLC and...,sec_filings,financial_reports
0000005513-21-000015_20210217x10k.pdf,Note 3 - Investments - Continued 157 Shown bel...,sec_filings,financial_reports
...,...,...,...
uksi_20210501_en.pdf,S T A T U T O R Y I N S T R U M E N T S 2021 N...,gb_laws,laws_and_regulations
uksi_20210525_en.pdf,(b) the offence may for all incidental purpose...,gb_laws,laws_and_regulations
uksi_20210534_en.pdf,SCHEDULES SCHEDULE 1 Administrative procedur...,gb_laws,laws_and_regulations
uksi_20210548_en.pdf,ISBN 978-0-34-822324-8 £4.90 http://www.legis...,gb_laws,laws_and_regulations


In [44]:
import random

collection_types = result['collection'].sample(n=14).sort_values().tolist()
random.shuffle(collection_types)
print(f"Sampled 14 collection types: {collection_types}")

for collection_type in list(collection_types):
    documents = result[result['collection'] == collection_type]
    if len(documents) < 6:
        collection_types.remove(collection_type)
collection_types = collection_types[:6]
print(f"Sampled 6 collection types with at least 6 documents: {collection_types}")

Sampled 14 collection types: ['ann_reports_00_04_fancy', 'russian_laws', 'patents_wp', 'arxiv_two_columns', 'patents_cn', 'patents_wp', 'patents_jp', 'arxiv_doublespaced', 'arxiv_mediumspaced', 'ann_reports_00_04_fancy', 'ann_reports_00_04_fancy', 'ann_reports_00_04_fancy', 'eu_tenders', 'ann_reports_00_04_fancy']
Sampled 6 collection types with at least 6 documents: ['ann_reports_00_04_fancy', 'patents_wp', 'arxiv_two_columns', 'patents_wp', 'arxiv_doublespaced', 'arxiv_mediumspaced']


Count number of conflicts per category (collection type)

In [52]:
conflicts_per_category_data = []

for collection_type in collection_types:
    documents = ' '.join(result[result['collection'] == collection_type]["texts"].tolist())
    try:
        response = (count_conflicts | structured_model).invoke({
            "text": documents
        })
        display(response)
        conflicts_per_category_data.append({
            'collection_type': collection_type,
            'conflicts': response.conflict_instances,
            'conflicts_count': len(response.conflict_instances),
        })
    except Exception as e:
        print(f"Error counting conflicts for category '{collection_type}': {e}")
        conflicts_per_category_data.append({
            'collection_type': collection_type,
            'conflicts': str(e),
            'conflicts_count': -1,
        })

conflicts_per_category_df = pd.DataFrame(conflicts_per_category_data)
display(conflicts_per_category_df)

Unnamed: 0,collection_type,conflicts,conflicts_count
0,ann_reports_00_04_fancy,[The text states that the philosophy of the Pr...,4
1,patents_wp,[The text mentions 'encode one or more assignm...,3
2,arxiv_two_columns,[The text mentions that 'the ratio of q and 1 ...,6
3,patents_wp,[The text mentions 'encode one or more assignm...,5
4,arxiv_doublespaced,[The text mentions that the comoving mean matt...,8
5,arxiv_mediumspaced,[The text states that the LEP I/II experiments...,5


In [88]:
conflicts_per_document_data = []

display(collection_types)
for collection_type in collection_types:
    documents = result[result['collection'] == collection_type]
    for _, document in documents.iterrows():
        try:
            response = (count_conflicts | structured_model).invoke({
                "text": document["texts"]
            })
            display(response)
            conflicts_per_document_data.append({
                'collection_type': collection_type,
                'document_name': document.name,
                'conflicts': response.conflict_instances,
                'conflicts_count': len(response.conflict_instances),
            })
        except Exception as e:
            print(f"Error counting conflicts for document '{document['original_filename']}' in category '{collection_type}': {e}")
            conflicts_per_document_data.append({
                'collection_type': collection_type,
                'document_name': document.name,
                'conflicts': str(e),
                'conflicts_count': -1,
            })

conflicts_per_document_df = pd.DataFrame(conflicts_per_document_data)
display(conflicts_per_document_df)

Unnamed: 0,collection_type,document_name,conflicts,conflicts_count
0,ann_reports_00_04_fancy,AMEX_IMH_2000.pdf,[The Progressive Series philosophy states that...,2
1,ann_reports_00_04_fancy,AMEX_NBR_2002.pdf,[],0
2,ann_reports_00_04_fancy,ASX_HVN_2003.pdf,[The total reserves for the consolidated entit...,3
3,ann_reports_00_04_fancy,ASX_STO_2004.pdf,[The total current assets for 2004 are listed ...,18
4,ann_reports_00_04_fancy,ASX_SUL_2004.pdf,[The text states that Nick Binns has overall r...,2
...,...,...,...,...
229,arxiv_mediumspaced,1002.3948.pdf,[The text states that the fit to the data α fi...,2
230,arxiv_mediumspaced,1002.4052.pdf,[The text mentions U$_{q}$ is expressed by a c...,8
231,arxiv_mediumspaced,1002.4278.pdf,[The text states that the discovery of the acc...,2
232,arxiv_mediumspaced,1002.4666.pdf,"[Statement A suggests that w < ∼ M^2, while St...",3
