# Python Code
### Automation of validating quotes given by generative AI output to identify page number

In [436]:
#Importing packages for search functions
import PyPDF2
import re
import pandas as pd
import os

In [437]:
# Read the CSV file into a DataFrame
outputs = os.listdir("ComparisonTables")
print(outputs)

dataframes = []

common_columns = [
    'Report#1',
    'Report#1_Quote',
    'Report#2',
    'Report#2_Quote'
]

# Read each CSV file into a DataFrame, rename the columns, and append to the list
for output in outputs:
    df = pd.read_csv("ComparisonTables/" + output)
    df.columns = common_columns[:len(df.columns)]  # Rename columns to the common names
    dataframes.append(df)

# Combine all DataFrames into one
df = pd.concat(dataframes, ignore_index=True)

# # FOR TESTING
# df = df.iloc[0:3]

['ComparisonTable.csv', 'ComparisonTable2.csv']


In [438]:
df

Unnamed: 0,Report#1,Report#1_Quote,Report#2,Report#2_Quote
0,Impact of Food Systems on Health and Well-being,"""Food systems may exert both positive and nega...",Focus on Nutritional Value of Products,"""We aim to produce nutritious and high-quality..."
1,Unhealthy Diets as a Key Risk Factor for Globa...,"""Unhealthy diets are among the key risk factor...",Addressing Lifestyle Diseases,"""We are actively engaged in addressing lifesty..."
2,Importance of Nutritional Quality in Food Prod...,"""The nutritional quality of foods produced and...",Product Reformulation for Nutritional Improvement,"""We have reformulated several of our products ..."
3,Malnutrition and Food Insecurity,"""Malnutrition in all its forms increases susce...",Tackling Malnutrition,"""FrieslandCampina is committed to tackling mal..."
4,Role of Health Systems in Ensuring Nutritional...,"""Health systems and their services are vital i...",Collaboration with Health Authorities,"""We collaborate with health authorities to ens..."
5,COVID-19 Impact on Nutrition,"""The COVID-19 pandemic has led to multiple eco...",Responding to COVID-19 Challenges,"""During the COVID-19 pandemic, we have worked ..."
6,Affordability of Healthy Diets,"""The cost and affordability of healthy diets a...",Making Nutritious Products Affordable,"""We strive to make our nutritious products aff..."
7,Nutrition-sensitive Social Protection Systems,"""Nutrition-sensitive social protection systems...",Social Responsibility and Nutritional Outreach,"""Our social responsibility programs focus on p..."
8,Promoting Healthy Food Environments,"""Food environment policies that foster food sy...",Creating Healthy Food Environments,"""We are dedicated to creating healthy food env..."
9,Dietary Patterns and Consumer Behavior,"""Many elements of the food environment determi...",Influencing Consumer Behavior Towards Healthy ...,"""Through marketing and consumer education, we ..."


In [439]:
def GPT_numbers(quotes):
    gpt_numbers = []
    for quote in quotes:
        if pd.isna(quote):
            gpt_numbers.append(None)
            continue
        
        # Find all numbers in brackets
        numbers = re.findall("p\. (\d+)", quote)
        numbers = [int(num) for num in numbers]

        #it can find more than one number but it will always take the first one
        gpt_numbers.append(numbers[0])
    return gpt_numbers

In [440]:
def clean_text(quotes):
    cleaned_quotes = []
    for quote in quotes:
        if pd.isna(quote):
            cleaned_quotes.append(None)  # Placeholder for NaN case
            continue
        
        # Remove text between brackets
        regularExp = re.compile(r'(\(|\[|\{)[A-Za-z\ ].+(\)|\]|\})')
        quote = regularExp.sub('', quote)
        
        # Strip, lower, replace
        quote = quote.strip().lower().replace("\n","")
        
        # Remove punctuation
        quote = re.sub(r'[^\w\s]', ' ', quote)
        
        # Reduce multiple spaces to single space
        quote = re.sub("\s\s+", " ", quote)
        
        cleaned_quotes.append(quote)
    
    return cleaned_quotes

In [441]:
# def find_quotes_in_pdf(report, cleaned_quotes):
#     found_numbers = []
#     for quote in cleaned_quotes:
#         if pd.isna(quote):
#             found_numbers.append(None)   # Placeholder for NaN case
#             continue
    
#         reader = PyPDF2.PdfReader("Reports/"+report)
#         NumPages = len(reader.pages)
    
#         #flag
#         quote_found = False
        
#         # Extract text and do the search
#         for i in range(0, NumPages):
#             PageObj = reader.pages[i]
#             Extract = PageObj.extract_text()
#             Text = Extract.lower()
#             Text = Text.replace("\n","")
#             Text = re.sub(r'[^\w\s]', ' ', Text)
#             Text = re.sub("\s\s+", " ", Text) 
    
#             if re.search(quote,Text):
#                 found_numbers.append(i + 1)
#                 quote_found = True
#                 break
    
#         if quote_found == False:
#             found_numbers.append(None)  
    
#     return found_numbers

In [442]:
def find_quotes_in_pdf(report, cleaned_quotes):
    found_numbers = [None] * len(cleaned_quotes)  # Initialize with None for each quote
    reader = PyPDF2.PdfReader("Reports/" + report)
    NumPages = len(reader.pages)

    for i in range(NumPages):
        PageObj = reader.pages[i]
        Extract = PageObj.extract_text()
        Text = Extract.lower()
        Text = Text.replace("\n", "")
        Text = re.sub(r'[^\w\s]', ' ', Text)
        Text = re.sub("\s\s+", " ", Text)
        
        for j in range(len(cleaned_quotes)):
            quote = cleaned_quotes[j]
            if pd.isna(quote):
                continue  # Skip NaN quotes
            
            if re.search(quote, Text) and found_numbers[j] is None:
                found_numbers[j] = i + 1  # Record the page number

    return found_numbers

In [443]:
def check_pages(row, page_GPT_name, page_found_name):
    if pd.notna(row[page_GPT_name]) and pd.notna(row[page_found_name]):
        if row[page_GPT_name] == row[page_found_name]:
            return "Same Page"
        else:
            return "Different Page"
    else:
        return "Not Found"

In [444]:
traceTable = df.copy()

In [445]:
#RN we are searching every page over and over, maybe it would be better to just open the pager and search every quote inside of it. 

In [446]:
# List all files in the Reports folder
reports = os.listdir("Reports")

for report in reports:
    for column in df.iloc[:, [1, 3]]:
        #get GPT number
        gpt_numbers = GPT_numbers(df[column])

        #clean quotes
        quotes = clean_text(df[column])

        #find quotes
        found_numbers = find_quotes_in_pdf(report, quotes)
        
        ###################################################
        #insert all results into df traceTable

        page_GPT_name = column + " GPT Page Number"
        if page_GPT_name not in traceTable.columns:
            insert_pos = traceTable.columns.get_loc(column) + 1
            traceTable.insert(insert_pos, page_GPT_name, gpt_numbers)

        page_found_name = report + " " + column + " Found Page Number"
        insert_pos = traceTable.columns.get_loc(column) + 2
        traceTable.insert(insert_pos, page_found_name, found_numbers)

        checker_name = 'Checker ' + report + " " + column
        insert_pos = traceTable.columns.get_loc(column) + 3
        traceTable.insert(insert_pos, checker_name, traceTable.apply(lambda row: check_pages(row, page_GPT_name, page_found_name), axis=1))

        print(traceTable[checker_name].value_counts().to_string() + "\n")

Checker FAO-2021.pdf Report#1_Quote
Different Page    11
Not Found          7

Checker FAO-2021.pdf Report#2_Quote
Not Found    18

Checker FrieslandCampina-Annual-Report-2021.pdf Report#1_Quote
Not Found    18

Checker FrieslandCampina-Annual-Report-2021.pdf Report#2_Quote
Not Found    14
Same Page     4



In [447]:
traceTable

Unnamed: 0,Report#1,Report#1_Quote,Report#1_Quote GPT Page Number,FrieslandCampina-Annual-Report-2021.pdf Report#1_Quote Found Page Number,Checker FrieslandCampina-Annual-Report-2021.pdf Report#1_Quote,FAO-2021.pdf Report#1_Quote Found Page Number,Checker FAO-2021.pdf Report#1_Quote,Report#2,Report#2_Quote,Report#2_Quote GPT Page Number,FrieslandCampina-Annual-Report-2021.pdf Report#2_Quote Found Page Number,Checker FrieslandCampina-Annual-Report-2021.pdf Report#2_Quote,FAO-2021.pdf Report#2_Quote Found Page Number,Checker FAO-2021.pdf Report#2_Quote
0,Impact of Food Systems on Health and Well-being,"""Food systems may exert both positive and nega...",112,,Not Found,24.0,Different Page,Focus on Nutritional Value of Products,"""We aim to produce nutritious and high-quality...",24.0,,Not Found,,Not Found
1,Unhealthy Diets as a Key Risk Factor for Globa...,"""Unhealthy diets are among the key risk factor...",112,,Not Found,138.0,Different Page,Addressing Lifestyle Diseases,"""We are actively engaged in addressing lifesty...",26.0,,Not Found,,Not Found
2,Importance of Nutritional Quality in Food Prod...,"""The nutritional quality of foods produced and...",113,,Not Found,139.0,Different Page,Product Reformulation for Nutritional Improvement,"""We have reformulated several of our products ...",29.0,,Not Found,,Not Found
3,Malnutrition and Food Insecurity,"""Malnutrition in all its forms increases susce...",113,,Not Found,139.0,Different Page,Tackling Malnutrition,"""FrieslandCampina is committed to tackling mal...",31.0,,Not Found,,Not Found
4,Role of Health Systems in Ensuring Nutritional...,"""Health systems and their services are vital i...",112,,Not Found,24.0,Different Page,Collaboration with Health Authorities,"""We collaborate with health authorities to ens...",33.0,,Not Found,,Not Found
5,COVID-19 Impact on Nutrition,"""The COVID-19 pandemic has led to multiple eco...",33,,Not Found,,Not Found,Responding to COVID-19 Challenges,"""During the COVID-19 pandemic, we have worked ...",36.0,,Not Found,,Not Found
6,Affordability of Healthy Diets,"""The cost and affordability of healthy diets a...",24,,Not Found,51.0,Different Page,Making Nutritious Products Affordable,"""We strive to make our nutritious products aff...",38.0,,Not Found,,Not Found
7,Nutrition-sensitive Social Protection Systems,"""Nutrition-sensitive social protection systems...",113,,Not Found,,Not Found,Social Responsibility and Nutritional Outreach,"""Our social responsibility programs focus on p...",40.0,,Not Found,,Not Found
8,Promoting Healthy Food Environments,"""Food environment policies that foster food sy...",113,,Not Found,,Not Found,Creating Healthy Food Environments,"""We are dedicated to creating healthy food env...",43.0,,Not Found,,Not Found
9,Dietary Patterns and Consumer Behavior,"""Many elements of the food environment determi...",113,,Not Found,131.0,Different Page,Influencing Consumer Behavior Towards Healthy ...,"""Through marketing and consumer education, we ...",45.0,,Not Found,,Not Found


In [449]:
traceTable.to_csv

<bound method NDFrame.to_csv of                                              Report#1  \
0     Impact of Food Systems on Health and Well-being   
1   Unhealthy Diets as a Key Risk Factor for Globa...   
2   Importance of Nutritional Quality in Food Prod...   
3                    Malnutrition and Food Insecurity   
4   Role of Health Systems in Ensuring Nutritional...   
5                        COVID-19 Impact on Nutrition   
6                      Affordability of Healthy Diets   
7       Nutrition-sensitive Social Protection Systems   
8                 Promoting Healthy Food Environments   
9              Dietary Patterns and Consumer Behavior   
10                  Breastfeeding and Child Nutrition   
11                   Impact of food systems on health   
12                       Health risks from poor diets   
13      Importance of breastfeeding and child feeding   
14                          Food environment policies   
15     Nutrition-responsive social protection systems   

In [425]:
# # # List all files in the Reports folder
# reports = os.listdir("Reports")

# for report in reports:
# ################################

#     for column in df.iloc[:, [1, 3]]:

#     #     traceTable[column + " Cleaned"] = 0
        
#         found_numbers = []
#         quotes = []
#         gpt_numbers = []

#         for quote in df[column]:

#             #Formatting of quote to effectively search (removal random characters, punctuation and extra spaces)
#             #check length
#             # print(quote)

#             if pd.isna(quote):
#                 gpt_numbers.append("NaN")
#                 quotes.append(quote)
#                 continue

#             # Find all numbers in brackets
#             numbers = re.findall("p\. (\d+)", quote)
#             #I dont like this re, but it works for now

#             # Convert to integers if needed
#             numbers = [int(num) for num in numbers]

#             gpt_numbers.extend(numbers)

#             #pattern for text between brackets
#             regularExp = re.compile('(\(|\[|\{)[A-Za-z\ ].+(\)|\]|\})')

#             #remove the text found using previous pattern
#             quote = regularExp.sub('', quote)

#             #strip, lower, replace
#             quote = quote.strip().lower().replace("\n","")

#             #removes all punctuation
#             quote = re.sub(r'[^\w\s]', ' ', quote)

#             #removes all spaces that are not single
#             quote = re.sub("\s\s+", " ", quote)

#             #add the quotes to a list
#             quotes.append(quote)
            
#             ############################################
#             reader = PyPDF2.PdfReader("Reports/"+report)
#             NumPages = len(reader.pages)

#             #flag
#             quote_found = False
#             print(quote)
            
#             # Extract text and do the search
#             for i in range(0, NumPages):
#                 PageObj = reader.pages[i]
#                 Extract = PageObj.extract_text()
#                 Text = Extract.lower()
#                 Text = Text.replace("\n","")
#                 Text = re.sub(r'[^\w\s]', ' ', Text)
#                 Text = re.sub("\s\s+", " ", Text) 

#                 if re.search(quote,Text):
#                     found_numbers.append(i + 1)
#                     print(found_numbers)
#                     quote_found = True
#                     break

#             if not quote_found:
#                 found_numbers.append("NaN")  
#                 print(found_numbers)
                
#             ##############################################
        
#         #index of insert position
#         insert_pos = traceTable.columns.get_loc(column) + 1
        
#         traceTable.insert(insert_pos, report + column + "GPT Page Number", gpt_numbers)
        
#         insert_pos = traceTable.columns.get_loc(column) + 2
        
#         traceTable.insert(insert_pos, report + column + "Found Page Number", found_numbers)

In [426]:
# import os
# import re
# import pandas as pd
# import PyPDF2

# # Function to clean and process quotes
# def clean_quotes(quotes):
#     cleaned_quotes = []
#     for quote in quotes:
#         if pd.isna(quote):
#             cleaned_quotes.append(([], quote))  # Placeholder for NaN case
#             continue
        
#         # Find all numbers in brackets
#         numbers = re.findall("p\. (\d+)", quote)
#         numbers = [int(num) for num in numbers]
        
#         # Remove text between brackets
#         regularExp = re.compile(r'(\(|\[|\{)[A-Za-z\ ].+(\)|\]|\})')
#         quote = regularExp.sub('', quote)
        
#         # Strip, lower, replace
#         quote = quote.strip().lower().replace("\n","")
        
#         # Remove punctuation
#         quote = re.sub(r'[^\w\s]', ' ', quote)
        
#         # Reduce multiple spaces to single space
#         quote = re.sub("\s\s+", " ", quote)
        
#         cleaned_quotes.append((numbers, quote))
    
#     return cleaned_quotes

# # Function to find quotes in PDF and extract page numbers
# def find_quotes_in_pdf(report, cleaned_quotes):
#     found_numbers = []
#     for numbers, quote in cleaned_quotes:
#         quote_found = False
#         reader = PyPDF2.PdfReader("Reports/" + report)
#         NumPages = len(reader.pages)
        
#         for i in range(NumPages):
#             PageObj = reader.pages[i]
#             Extract = PageObj.extract_text()
#             Text = Extract.lower().replace("\n","")
#             Text = re.sub(r'[^\w\s]', ' ', Text)
#             Text = re.sub("\s\s+", " ", Text)
            
#             if re.search(quote, Text):
#                 found_numbers.append(i + 1)
#                 quote_found = True
#                 break
        
#         if not quote_found:
#             found_numbers.append("NaN")
    
#     return found_numbers

# # List all files in the Reports folder
# reports = os.listdir("Reports")

# for report in reports:
#     for column in df.iloc[:, [1, 3]]:
#         quotes = list(df[column])
        
#         # Clean and process quotes
#         cleaned_quotes = clean_quotes(quotes)
        
#         for nums, quote in cleaned_quotes:
#             gpt_numbers.extend(nums)
#             quotes.append(quote)
        
#         # Find quotes in PDF and extract page numbers
#         found_numbers = find_quotes_in_pdf(report, cleaned_quotes)
        
#         # Insert into traceTable
#         insert_pos = traceTable.columns.get_loc(column) + 1
#         traceTable.insert(insert_pos, report + column + "GPT Page Number", gpt_numbers)
        
#         insert_pos = traceTable.columns.get_loc(column) + 2
#         traceTable.insert(insert_pos, report + column + "Found Page Number", found_numbers)

In [427]:
# traceTable

In [428]:
# #Importing PDF and searching for quote
# import os

# # # List all files in the Reports folder
# reports = os.listdir("Reports")

# for report in reports:
#     page_numbers = []
#     for quote in traceTable['Supporting Quote from FrieslandCampina Report']:

#         print(quote)

#         if pd.isna(quote):
#             page_numbers.append("NaN")
#             continue

#         reader = PyPDF2.PdfReader("Reports/"+report)
#         NumPages = len(reader.pages)

#         #flag
#         quote_found = False

#         # Extract text and do the search
#         for i in range(0, NumPages):
#             PageObj = reader.pages[i]
#             Extract = PageObj.extract_text()
#             Text = Extract.lower()
#             Text = Text.replace("\n","")
#             Text = re.sub(r'[^\w\s]', ' ', Text)
#             Text = re.sub("\s\s+", " ", Text) 

#             if re.search(quote,Text):
#                 page_numbers.append(i + 1)
#                 print(page_numbers)
#                 quote_found = True
#                 break

#         if not quote_found:
#             page_numbers.append("NaN")  
#             print(page_numbers)

#     traceTable["PageFound"] = page_numbers

In [429]:
# print(page_numbers)

In [430]:
# traceTable