# Python Code
### Automated validation of quotes produced by generative AI: locating the page on which each quote appears in the source reports

In [1]:
#Importing packages for search functions
import PyPDF2
import re
import pandas as pd
import os

In [2]:
# Read every comparison-table CSV in the ComparisonTables folder into one DataFrame.
# os.listdir returns entries in arbitrary order and also lists non-CSV entries
# (e.g. .DS_Store, .ipynb_checkpoints), so keep only *.csv files and sort them
# to make the run deterministic.
outputs = sorted(
    name for name in os.listdir("ComparisonTables")
    if name.lower().endswith(".csv")
)
print(outputs)

dataframes = []

# Column names shared by every comparison table, in positional order
common_columns = [
    'Report#1',
    'Report#1_Quote',
    'Report#2',
    'Report#2_Quote'
]

# Read each CSV file into a DataFrame, rename the columns, and append to the list
for output in outputs:
    table = pd.read_csv(os.path.join("ComparisonTables", output))
    table.columns = common_columns[:len(table.columns)]  # Rename columns to the common names
    dataframes.append(table)

# Combine all DataFrames into one
df = pd.concat(dataframes, ignore_index=True)

# # FOR TESTING
# df = df.iloc[0:3]

['ComparisonTable.csv', 'ComparisonTable2.csv']


In [3]:
# Display the combined comparison table as this cell's output.
df

Unnamed: 0,Report#1,Report#1_Quote,Report#2,Report#2_Quote
0,Impact of Food Systems on Health and Well-being,"""Food systems may exert both positive and nega...",Focus on Nutritional Value of Products,"""We aim to produce nutritious and high-quality..."
1,Unhealthy Diets as a Key Risk Factor for Globa...,"""Unhealthy diets are among the key risk factor...",Addressing Lifestyle Diseases,"""We are actively engaged in addressing lifesty..."
2,Importance of Nutritional Quality in Food Prod...,"""The nutritional quality of foods produced and...",Product Reformulation for Nutritional Improvement,"""We have reformulated several of our products ..."
3,Malnutrition and Food Insecurity,"""Malnutrition in all its forms increases susce...",Tackling Malnutrition,"""FrieslandCampina is committed to tackling mal..."
4,Role of Health Systems in Ensuring Nutritional...,"""Health systems and their services are vital i...",Collaboration with Health Authorities,"""We collaborate with health authorities to ens..."
5,COVID-19 Impact on Nutrition,"""The COVID-19 pandemic has led to multiple eco...",Responding to COVID-19 Challenges,"""During the COVID-19 pandemic, we have worked ..."
6,Affordability of Healthy Diets,"""The cost and affordability of healthy diets a...",Making Nutritious Products Affordable,"""We strive to make our nutritious products aff..."
7,Nutrition-sensitive Social Protection Systems,"""Nutrition-sensitive social protection systems...",Social Responsibility and Nutritional Outreach,"""Our social responsibility programs focus on p..."
8,Promoting Healthy Food Environments,"""Food environment policies that foster food sy...",Creating Healthy Food Environments,"""We are dedicated to creating healthy food env..."
9,Dietary Patterns and Consumer Behavior,"""Many elements of the food environment determi...",Influencing Consumer Behavior Towards Healthy ...,"""Through marketing and consumer education, we ..."


In [4]:
def GPT_numbers(quotes):
    """Extract the page number cited by the generative AI for each quote.

    A citation is recognised as the text ``p.`` followed by digits, for
    example ``(p. 12)``. When a quote contains several citations, only the
    first one is used.

    Parameters
    ----------
    quotes : iterable
        Quote strings; missing values (None / NaN) are allowed.

    Returns
    -------
    list
        One entry per quote, in input order: the cited page as ``int``, or
        ``None`` when the quote is missing or carries no page citation.
    """
    # Raw string avoids invalid-escape warnings; \s* also accepts "p.12"
    page_pattern = re.compile(r"p\.\s*(\d+)")

    gpt_numbers = []
    for quote in quotes:
        if pd.isna(quote):
            gpt_numbers.append(None)
            continue

        # Only the first citation matters. A quote without any citation yields
        # None instead of raising IndexError on an empty match list.
        match = page_pattern.search(str(quote))
        gpt_numbers.append(int(match.group(1)) if match else None)
    return gpt_numbers

In [5]:
def clean_text(quotes):
    """Normalise quotes so they can be matched against normalised PDF text.

    For every quote: bracketed notes such as ``(p. 12)`` are removed, the text
    is lower-cased, punctuation is replaced by spaces, every run of whitespace
    (including newlines) is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Parameters
    ----------
    quotes : iterable
        Quote strings; missing values (None / NaN) are allowed.

    Returns
    -------
    list
        One entry per quote, in input order: the cleaned string, or ``None``
        for a missing quote.
    """
    # Non-greedy match: each bracketed note is removed on its own. A greedy
    # ".+" would delete everything between the first opening and the last
    # closing bracket, including real quote text between two notes.
    bracketed = re.compile(r'[(\[{][A-Za-z ].+?[)\]}]')

    cleaned_quotes = []
    for quote in quotes:
        if pd.isna(quote):
            cleaned_quotes.append(None)  # Placeholder for NaN case
            continue

        # Remove text between brackets
        quote = bracketed.sub('', quote)

        # Lower-case, then turn punctuation into spaces
        quote = re.sub(r'[^\w\s]', ' ', quote.lower())

        # Collapse all whitespace (newlines become a space rather than gluing
        # words together) and strip last, so the spaces left behind by the
        # surrounding quotation marks do not end up in the result.
        quote = re.sub(r'\s+', ' ', quote).strip()

        cleaned_quotes.append(quote)

    return cleaned_quotes

In [6]:
def find_quotes_in_pdf(report, cleaned_quotes):
    """Find the first page of a PDF report on which each quote appears.

    The text of every page is normalised the same way as the quotes
    (lower-cased, punctuation replaced by spaces, whitespace collapsed) and
    each quote is then searched for as a literal substring.

    Parameters
    ----------
    report : str
        File name of the PDF inside the ``Reports`` folder.
    cleaned_quotes : sequence
        Quotes as returned by the cleaning step; missing values (None / NaN)
        are allowed.

    Returns
    -------
    list
        One entry per quote, in input order: the 1-based number of the first
        page containing the quote, or ``None`` when the quote is missing,
        empty, or not found in the report.
    """
    found_numbers = [None] * len(cleaned_quotes)  # Initialize with None for each quote
    reader = PyPDF2.PdfReader("Reports/" + report)

    # Quotes still to be located, keyed by their position in the input.
    # Missing and empty quotes are never searchable; an empty string would
    # otherwise "match" on the very first page.
    pending = {}
    for position, quote in enumerate(cleaned_quotes):
        if pd.isna(quote):
            continue  # Skip NaN quotes
        needle = re.sub(r'\s+', ' ', str(quote)).strip()
        if needle:
            pending[position] = needle

    for page_index, page in enumerate(reader.pages):
        if not pending:
            break  # every searchable quote already has a page number

        # extract_text may yield nothing for image-only pages
        text = (page.extract_text() or "").lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        # Line breaks become a single space. Deleting them would fuse the last
        # word of one line with the first word of the next, so quotes spanning
        # a line break could never match.
        text = re.sub(r'\s+', ' ', text)

        # Literal substring test: the quote is data, not a regex pattern
        located = [position for position, needle in pending.items() if needle in text]
        for position in located:
            found_numbers[position] = page_index + 1  # Record the page number
            del pending[position]

    return found_numbers

In [7]:
def check_pages(row, page_GPT_name, page_found_name):
    """Compare the page cited by GPT with the page found in the PDF for one row.

    Parameters
    ----------
    row : mapping
        Row of the trace table (anything indexable by column name).
    page_GPT_name : str
        Name of the column holding the page number cited by GPT.
    page_found_name : str
        Name of the column holding the page number found in the PDF.

    Returns
    -------
    str
        "Not Found" when either page number is missing, otherwise
        "Same Page" or "Different Page".
    """
    cited_page = row[page_GPT_name]
    if pd.isna(cited_page):
        return "Not Found"

    located_page = row[page_found_name]
    if pd.isna(located_page):
        return "Not Found"

    return "Same Page" if cited_page == located_page else "Different Page"

In [8]:
# Work on a copy so the page-number and checker columns inserted later do not modify df.
traceTable = df.copy()

# Results

In [9]:
# List the PDF reports in the Reports folder.
# os.listdir also returns non-PDF entries (e.g. .DS_Store) that PdfReader
# cannot open, and its order is arbitrary, so filter by extension and sort.
reports = sorted(
    name for name in os.listdir("Reports")
    if name.lower().endswith(".pdf")
)

# The two quote columns, selected by position (second and fourth column of df)
quote_columns = list(df.iloc[:, [1, 3]])

# The GPT-cited page numbers and the cleaned quotes depend only on the column,
# not on the report, so compute them once instead of once per report.
gpt_numbers_by_column = {column: GPT_numbers(df[column]) for column in quote_columns}
quotes_by_column = {column: clean_text(df[column]) for column in quote_columns}

for report in reports:
    for column in quote_columns:
        #get GPT number
        gpt_numbers = gpt_numbers_by_column[column]

        #find the cleaned quotes in this report
        found_numbers = find_quotes_in_pdf(report, quotes_by_column[column])

        ###################################################
        #insert all results into df traceTable

        # The GPT page column is report-independent: add it only once, right after the quote column
        page_GPT_name = column + " GPT Page Number"
        if page_GPT_name not in traceTable.columns:
            insert_pos = traceTable.columns.get_loc(column) + 1
            traceTable.insert(insert_pos, page_GPT_name, gpt_numbers)

        # Page on which this report contains the quote (None when not found)
        page_found_name = report + " " + column + " Found Page Number"
        insert_pos = traceTable.columns.get_loc(column) + 2
        traceTable.insert(insert_pos, page_found_name, found_numbers)

        # Row-wise verdict comparing the GPT page with the found page
        checker_name = 'Checker ' + report + " " + column
        insert_pos = traceTable.columns.get_loc(column) + 3
        traceTable.insert(insert_pos, checker_name, traceTable.apply(lambda row: check_pages(row, page_GPT_name, page_found_name), axis=1))

        print(traceTable[checker_name].value_counts().to_string() + "\n")