# Import dependencies

In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228


In [None]:
# Standard library
import json
import re

# Third-party
import pandas as pd
from google.colab import drive
from pdfminer.high_level import extract_text

# Mount Google Drive so that files under /content/drive can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


# Key functions

A large portion of our data comes in PDF format. While packages like pdfminer do a great job of extracting the text, some PDF artifacts inevitably remain and have to be cleaned up afterwards.

In [None]:
def extract_text_from_pdf(file_path):
    """
    Read a PDF and return its textual content.

    Any failure during extraction is reported on stdout instead of being
    raised, so callers must be prepared for a ``None`` result.

    Args:
    file_path (str): Location of the PDF to read.

    Returns:
    str or None: The text returned by ``extract_text``, or ``None`` when
    extraction raised an exception.
    """
    try:
        return extract_text(file_path)
    except Exception as error:
        # Best-effort: report the problem and fall through to the None result.
        print(f"An error occurred: {error}")
    return None

We start by replacing tab characters (`\t`) with spaces and removing form-feed characters (`\x0c`, also written `\f`). We then merge words that were hyphenated across line breaks (`- \n`, `- `).

Further, while we currently only consider textual data (i.e., we remove any figures, tables or boxes), some sentences that refer to non-textual data are nevertheless useful, as they sometimes present a short summary of the graphical representation they refer to. In any case, direct references to figures or tables are obviously not something we need our chatbot to learn - we therefore substitute any mention of [Figures, Tables, Boxes] with "the data" (or with "as shown by the data" when the reference appears in parentheses).

In [None]:
def clean_text(text):
    """
    Remove PDF-extraction artifacts and neutralise references to non-textual
    elements (figures, boxes, tables).

    Args:
    text (str): Raw text, e.g. the result of ``extract_text_from_pdf``.

    Returns:
    str: The text with tabs turned into spaces, form feeds removed,
    hyphenated line breaks merged, and every "Figure N" / "Box N" /
    "Table N" reference (singular or plural, N like ``3`` or ``3.1.2``)
    replaced by "the data" ("as shown by the data" when the reference is
    wrapped in parentheses).
    """
    # Tabs become single spaces.
    text = text.replace("\t", " ")
    # Form feeds: "\x0c" and "\f" are the same character. A form feed that is
    # preceded by a space is removed together with that space.
    text = text.replace(" \x0c", "")
    text = text.replace("\f", "")
    # Re-join words that were hyphenated across a line break.
    text = text.replace("- \n", "")
    # NOTE(review): this removes every "- " in the text, which also affects
    # spaced dashes ("a - b") and list bullets - confirm this is intended.
    text = text.replace("- ", "")

    # A reference number such as "3", "3.1" or "3.1.2".
    number = r'\d+(?:\.\d+)*'
    # Same order as before: figures, then boxes, then tables. For each label
    # the parenthesised form is handled first so the brackets are consumed too.
    for label in (r'Figures?', r'Box(?:es)?', r'Tables?'):
        text = re.sub(rf'\({label} {number}\)', "as shown by the data", text)
        text = re.sub(rf'{label} {number}', "the data", text)

    return text

def split_into_paragraphs(text):
    """
    Split text into single-line paragraphs and re-join paragraphs that were
    cut in two (for instance by a page break).

    Paragraph boundaries are two consecutive newline characters. Within a
    paragraph every run of whitespace (including single newlines) is
    collapsed to one space. A paragraph that does not end in '.', '?' or '!'
    is considered unfinished and is joined, separated by a space, with the
    paragraph(s) that follow until one ends in such a character.

    Args:
    text (str): The text to split.

    Returns:
    tuple: ``(paragraphs, lengths)`` where ``paragraphs`` is the list of
    merged paragraph strings and ``lengths`` holds the number of
    whitespace-separated words of each paragraph, in the same order.
    """
    end_tokens = ('.', '?', '!')

    # " ".join(chunk.split()) trims the chunk and collapses *every* whitespace
    # run to one space; a single replace('  ', ' ') pass would leave runs of
    # three or more spaces only partially collapsed.
    normalised = (" ".join(chunk.split()) for chunk in text.split('\n\n'))
    cleaned_paragraphs = [paragraph for paragraph in normalised if paragraph]

    merged_paragraphs = []
    pending = []  # pieces of a paragraph that has not reached an end token yet
    for paragraph in cleaned_paragraphs:
        pending.append(paragraph)
        if paragraph.endswith(end_tokens):
            merged_paragraphs.append(" ".join(pending))
            pending = []

    # Keep trailing text even if it never reached an end token.
    if pending:
        merged_paragraphs.append(" ".join(pending))

    paragraph_lengths_final = [len(paragraph.split()) for paragraph in merged_paragraphs]
    return merged_paragraphs, paragraph_lengths_final

# Example usage


In [None]:
# Run the whole pipeline on one PDF: extract -> clean -> split into paragraphs.
file_path = 'example.pdf'
extracted_text = extract_text_from_pdf(file_path)
# extract_text_from_pdf returns None when extraction fails; stop with a clear
# message instead of letting clean_text fail on a None value.
if extracted_text is None:
    raise RuntimeError(f"Could not extract text from {file_path!r}")
cleaned_text = clean_text(extracted_text)
paragraphs, paragraph_lengths = split_into_paragraphs(cleaned_text)

Many of our data sources have a specific target group or region in mind. We would like to preserve this information when generating summaries and questions, and therefore prepend a region- or target-group-specific tag to each paragraph.

In [None]:
# Write one tagged paragraph per line. The encoding is stated explicitly so the
# file does not depend on the platform default; the extracted text may contain
# non-ASCII characters.
with open("/content/drive/MyDrive/Climate Change AIctivist/data/GEO_Youth_paragraphs.txt", 'w', encoding="utf-8") as file:
    for paragraph in paragraphs:
        tagged_paragraph = "[TARGET_GROUP_TAG] " + paragraph
        file.write(tagged_paragraph + '\n')

# Concatenate


In [None]:
import os

directory = '/content/drive/MyDrive/Climate Change AIctivist/data'  # Replace with your directory path
output_directory = "/content/drive/MyDrive/Climate Change AIctivist/all_data"

# Read each .txt file and concatenate the content (each file followed by a newline).
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the concatenated output reproducible between runs.
file_contents = []
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding="utf-8") as file:
            file_contents.append(file.read() + "\n")
all_text = "".join(file_contents)

# Save the concatenated content into a new file, creating the target folder if needed
os.makedirs(output_directory, exist_ok=True)
with open(os.path.join(output_directory, 'all_paragraphs.txt'), 'w', encoding="utf-8") as file:
    file.write(all_text)