# 1. PDF to Text Converter

In [None]:
import os
import PyPDF2

# --- 1. Path Configuration ---
# All paths are derived from the working directory. The layout assumed here is
# that the notebook runs from '<project root>/src', next to '<project root>/data'.
current_dir = os.getcwd()
base_dir = os.path.dirname(current_dir)
data_folder_path = os.path.join(base_dir, 'data')

# 'data/pdf' holds the input PDFs; 'data/txt' receives the extracted text.
pdf_folder_path, txt_save_path = (
    os.path.join(data_folder_path, sub_folder) for sub_folder in ('pdf', 'txt')
)

# --- 2. Directory Setup ---
# Create the output folder on first use and report that it was created.
if not os.path.exists(txt_save_path):
    os.makedirs(txt_save_path)
    print(f"Created a new folder: '{txt_save_path}'.")

# --- 3. PDF File Handling ---
# Collect the names of the PDF files to convert.
# The extension test is case-insensitive so that files such as 'REPORT.PDF'
# are not silently skipped, and the names are sorted so that every run
# processes the files in the same order (os.listdir gives no ordering guarantee).
try:
    file_list = os.listdir(pdf_folder_path)
    pdf_files = sorted(
        name for name in file_list if name.lower().endswith('.pdf')
    )
except FileNotFoundError:
    # A missing input folder is reported; the empty list leaves nothing to process.
    print(f"Error: PDF folder path not found. Please check the path: '{pdf_folder_path}'.")
    pdf_files = []

# --- 4. Main Processing Loop ---
# Convert each PDF into a UTF-8 text file with the same base name.
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder_path, pdf_file)
    # Same base name as the PDF, with a '.txt' extension, in the output folder.
    txt_path = os.path.join(txt_save_path, os.path.splitext(pdf_file)[0] + '.txt')

    try:
        # The PDF must stay open while pages are read, so extraction happens
        # inside the 'with' block.
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages are joined with a newline so the last word of one page is
            # not fused with the first word of the next. 'or ""' guards against
            # pages for which no text is returned.
            text = '\n'.join(page.extract_text() or '' for page in reader.pages)

        with open(txt_path, 'w', encoding='utf-8') as output_file:
            output_file.write(text)

    except Exception as e:
        # Report the failure for this file and carry on with the remaining ones.
        print(f"An error occurred while processing the file '{pdf_file}': {e}")
    else:
        print(f"Text successfully extracted from '{pdf_file}' and saved to '{os.path.basename(txt_path)}'.")

# --- 5. Completion Message ---
print("\nAll PDF files processed.")

# 2. Text Preprocessing

In [None]:
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# --- 1. Path Configuration ---
# Paths are derived from the working directory; the notebook is assumed to run
# from '<project root>/src', with the data in '<project root>/data'.
current_dir = os.getcwd()
base_dir = os.path.dirname(current_dir)
data_folder_path = os.path.join(base_dir, 'data')

# Optional project-specific stopword list (whitespace-separated words).
custom_stopwords_path = os.path.join(data_folder_path, 'stop_words_english.txt')

# Raw extracted text is read from 'data/txt'; cleaned text goes to 'data/txt2'.
input_directory = os.path.join(data_folder_path, 'txt')
output_directory = os.path.join(data_folder_path, 'txt2')

# --- 2. NLTK Setup ---
# Fetch the NLTK corpora this cell relies on. These calls are active, so they
# are issued every time the cell runs.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# --- 3. Preprocessing Tools Initialization ---
# WordNet lemmatizer used to reduce words to their base form.
lemma = WordNetLemmatizer()
# Set of ASCII punctuation characters.
exclude = set(string.punctuation)
# NLTK's English stopwords, extended with domain-specific terms that carry
# no topical meaning in research articles.
stop = set(stopwords.words('english'))
stop |= {
    'fig', 'table', 'example', 'figure', 'article',
    'history', 'abstract', 'keywords', 'research',
}

# --- 4. Custom Stopwords Function ---
# This function reads custom stopwords from a specified file and adds them to the main set.
def add_custom_stopwords(file_path):
    """Merge the whitespace-separated words in `file_path` into the global `stop` set.

    The file is read as UTF-8. If it does not exist, a notice is printed and
    `stop` is left untouched.

    Args:
        file_path: Path to a text file of custom stopwords.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            extra_words = handle.read().split()
    except FileNotFoundError:
        # The custom list is optional: report it and keep the default stopwords.
        print(f"Custom stopwords file not found: {file_path}")
        print("Continuing without custom stopwords.")
    else:
        stop.update(extra_words)

# Extend `stop` with the project's custom stopword file (skipped with a notice if the file is missing).
add_custom_stopwords(custom_stopwords_path)

# --- 5. Text Cleaning Function ---
# This function applies a series of cleaning and preprocessing steps to a given text.
def clean_text(text):
    """Normalize raw article text into a space-separated string of lemmatized words.

    Processing steps, in order:
      1. Lowercase the text and collapse every whitespace run to one space.
      2. Re-join the letter-spaced headings 'a b s t r a c t' / 'k e y w o r d s'.
      3. Strip known publisher copyright lines.
      4. Drop numeric tokens, including attached units (e.g. '1.2km', '100m').
      5. Keep alphabetic words of two or more letters, plus a standalone 'r'.
      6. Remove words found in the module-level `stop` set and lemmatize the
         rest with the module-level `lemma` lemmatizer.
      7. Remove the citation filler 'et al' and stray symbols.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text, with words separated by single spaces.
    """
    # Everything below matches against lowercase text, so all literals are lowercase.
    text = text.lower()

    # Collapse newlines, form feeds and repeated spaces into single spaces.
    # Whitespace is replaced, never deleted, so neighbouring words cannot merge.
    text = re.sub(r'\s+', ' ', text)

    # Some PDFs render section headings with a space between every letter.
    text = text.replace('a b s t r a c t', 'abstract ')
    text = text.replace('k e y w o r d s', 'keywords ')

    # Publisher boilerplate, written in lowercase so it matches the lowered text.
    # Longer variants come first so a trailing period is removed with the phrase.
    publisher_boilerplate = (
        '2020 elsevier ltd. all rights reserved.',
        '2021 elsevier ltd. all rights reserved.',
        '2021 elsevier b.v. all rights reserved.',
        '2020 elsevier ltd. all rights reserved',
        '2021 published by elsevier ltd.',
    )
    for phrase in publisher_boilerplate:
        text = text.replace(phrase, ' ')

    # Remove numbers, decimals, and units (e.g., '1.2km', '100m').
    text = re.sub(r'\b\d*\.?\d+[\w/]*\b', '', text)

    # Tokenize: words of at least two ASCII letters, or a standalone 'r'.
    words = re.findall(r'\b[a-zA-Z]{2,}\b|\b(?<!\S)r(?!\S)\b', text)

    # Filter stopwords, then lemmatize what remains.
    words = [lemma.lemmatize(word) for word in words if word not in stop]
    cleaned_text = ' '.join(words)

    # Remove 'et al' only as a standalone word pair; without the word
    # boundaries the pattern also eats text such as 'meet all' -> 'mel'.
    cleaned_text = re.sub(r'\bet al\b', '', cleaned_text)

    # Safety net for stray symbols, Greek letters and mathematical operators.
    # The tokenizer above keeps ASCII letters only, so these rarely apply.
    cleaned_text = re.sub(r'[·〠ð—©]', '', cleaned_text)
    cleaned_text = re.sub(r'[\u0370-\u03FF\u2200-\u22FF]+', '', cleaned_text)

    # Removing 'et al' can leave double spaces behind; normalize them.
    return ' '.join(cleaned_text.split())

# --- 6. Main Processing Function ---
# This function iterates through all '.txt' files in a given directory,
# processes each file using `clean_text`, and saves the output to a new directory.
def process_and_save_text_files(input_dir, output_dir):
    """Clean every '.txt' file in `input_dir` and write it to `output_dir`.

    Each file is read as UTF-8, passed through `clean_text`, and saved under
    the same file name in `output_dir`, which is created if necessary.
    A FileNotFoundError raised while listing or processing is reported as a
    missing input directory and ends the run.

    Args:
        input_dir: Folder containing the raw '.txt' files.
        output_dir: Folder that receives the cleaned files.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        for entry in os.listdir(input_dir):
            # Only plain-text files are processed; everything else is ignored.
            if not entry.endswith(".txt"):
                continue

            source_path = os.path.join(input_dir, entry)
            target_path = os.path.join(output_dir, entry)

            with open(source_path, 'r', encoding='utf-8') as reader:
                raw_content = reader.read()
            result = clean_text(raw_content)

            with open(target_path, 'w', encoding='utf-8') as writer:
                writer.write(result)
            print(f"[Save Complete] {entry} -> {target_path}")
    except FileNotFoundError:
        print(f"Error: Input directory not found. Please check the path: '{input_dir}'.")

# --- 7. Execution ---
# Clean every '.txt' file found in `input_directory` and write the results,
# under the same file names, to `output_directory`.
process_and_save_text_files(input_directory, output_directory)

# 3. Text Lemmatizer

In [None]:
import os
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# --- 1. Path Configuration ---
# Paths are derived from the working directory; the notebook is assumed to run
# from '<project root>/src', with the data in '<project root>/data'.
current_dir = os.getcwd()
base_dir = os.path.dirname(current_dir)
data_folder_path = os.path.join(base_dir, 'data')

# Project-local folder registered with NLTK as a resource location and used as
# the download target for missing corpora.
nltk_data_path = os.path.join(data_folder_path, 'nltk_data')

# Cleaned text is read from 'data/txt2'; lemmatized text goes to 'data/txt3'.
input_directory = os.path.join(data_folder_path, 'txt2')
output_directory = os.path.join(data_folder_path, 'txt3')

# --- 2. NLTK Setup ---
# Register the project-local data folder once so NLTK searches it.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download each required corpus only when NLTK cannot already locate it.
# The folder is on the search path from the step above, so it is not appended
# again after a download (that would only add duplicate entries).
for _resource in ('wordnet', 'stopwords'):
    try:
        nltk.data.find(f'corpora/{_resource}')
    except LookupError:
        print("No NLTK resource. Downloading now.")
        nltk.download(_resource, download_dir=nltk_data_path)

# --- 3. Main Processing Function ---
def process_and_save_files_manual_pos(input_folder, output_folder):
    """
    Lemmatize every '.txt' file in `input_folder` and save the result, under
    the same file name, in `output_folder`.

    Each file is lowercased and tokenized. Purely alphabetic tokens that are
    not NLTK English stopwords are lemmatized as verbs; when the verb lemma
    equals the token itself, the noun lemma is used instead. The surviving
    lemmas are joined with single spaces.

    Problems are reported with print() rather than raised: missing NLTK
    resources or a missing input folder end the run early, while a failure on
    one file is reported and the remaining files are still processed.
    """
    try:
        english_stopwords = set(stopwords.words('english'))
        wn_lemmatizer = WordNetLemmatizer()
        # Word-character runs only, so punctuation never becomes a token.
        word_tokenizer = RegexpTokenizer(r'\b\w+\b')
    except LookupError:
        print("Error: Required NLTK resources could not be loaded. Please ensure they are downloaded.")
        return

    if not os.path.isdir(input_folder):
        print(f"Error: The specified input path '{input_folder}' was not found.")
        return

    os.makedirs(output_folder, exist_ok=True)
    print(f"Processed files will be stored in '{output_folder}'")

    def verb_then_noun(token):
        # Prefer the verb lemma; fall back to the noun lemma when the verb
        # lookup leaves the token unchanged.
        as_verb = wn_lemmatizer.lemmatize(token, pos=wordnet.VERB)
        if as_verb != token:
            return as_verb
        return wn_lemmatizer.lemmatize(token, pos=wordnet.NOUN)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".txt"):
            continue

        source_path = os.path.join(input_folder, filename)
        target_path = os.path.join(output_folder, filename)

        print(f"--- Processing file: {filename} ---")

        try:
            with open(source_path, 'r', encoding='utf-8') as infile:
                raw_text = infile.read()

            kept_lemmas = [
                verb_then_noun(token)
                for token in word_tokenizer.tokenize(raw_text.lower())
                if token.isalpha() and token not in english_stopwords
            ]

            with open(target_path, 'w', encoding='utf-8') as outfile:
                outfile.write(' '.join(kept_lemmas))

            print(f"Success: '{filename}' has been processed and saved.\n")
        except Exception as e:
            print(f"Error processing file '{filename}': {e}")

# Run the lemmatization step only when this code is executed as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    process_and_save_files_manual_pos(input_directory, output_directory)