<a href="https://colab.research.google.com/github/khalilDimassi/Datasci_Resources/blob/master/pdf_segementing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install PyMuPDF pdf2image transformers pprintpp ultralytics
!apt-get install poppler-utils

In [2]:
%%capture
# For computer vision and image processing
import cv2
from pdf2image import convert_from_path
from PIL import Image

# For working with PDFs
import fitz
from fitz import open as fitz_open

# For file and directory operations
import os

# For data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For text processing and utilities
from pprint import pprint
import re

# For progress tracking
from tqdm.notebook import tqdm

# For object detection
from ultralytics import YOLO

# For handling web requests and data encoding
import requests
import base64

In [3]:
tqdm.pandas()

---
# Mining
---

In [4]:
def extract_text_from_pdf(file_path, start=1, finish=None):
    with fitz.open(file_path) as doc:
        text = ''
        pages_count = range(start-1, finish or len(doc))
        for i in tqdm(pages_count):
            page = doc.load_page(i)
            text += page.get_text()
        return text

In [5]:
def convert_pdf_to_images(input_path, output_path):
    # Convert the PDF to a list of images (one image per page)
    images = convert_from_path(input_path)
    image_paths = []

    # Ensure the output directory exists
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for i, image in tqdm(enumerate(images, start=1)):
        # Save each image as a file
        page_number = f'page_{i}'
        image_path = os.path.join(output_path, f'{page_number}'+'.jpg')
        image_paths.append(image_path)
        image.save(image_path, 'JPEG')

    return image_paths

In [6]:
def get_image_paths_from_directory(directory):
    image_paths = []

    for filename in os.listdir(directory):
        if filename.endswith(".jpg"):
            image_path = os.path.join(directory, filename)
            image_paths.append(image_path)

    return image_paths

In [7]:
def split_text(title_pattern, figure_pattern, raw_text):
    matches = list(re.finditer(title_pattern, raw_text))
    titles = ["0 Prologue"]
    matched_titles = [match.group(0) for match in matches]
    sections = re.split(title_pattern, raw_text)
    titles.extend([f"{matched_titles[i][3:]}" \
                   for i in tqdm(range(len(matched_titles)))])

    chapter_data = {}
    for i in tqdm(range(len(titles))):
        chapter_title = titles[i]
        chapter_content = sections[i]

        figure_matches = re.finditer(figure_pattern, chapter_content)
        figure_names = [re.sub(r'^A-Za-z0-9', '', match.group(0)) for match in figure_matches]

        chapter_data[chapter_title] = {
            "content": chapter_content,
            "figure_names": figure_names
        }

    return chapter_data

In [8]:
def edit_figures(figures_list):
  edited_figures = []
  for fig in figures_list:
    fig_type = 'Figure' if 'Figure' in fig else 'Table'
    fig_number = fig.split('-')[1][0]

    # Creating the new formatted figure string
    new_fig = f"{fig_type} {fig_number}"
    edited_figures.append(new_fig)

  return edited_figures

In [9]:
def cut_image(image_path, x_min, y_min, x_max, y_max, output_path):
    image = cv2.imread(image_path)
    cropped_image = image[y_min:y_max, x_min:x_max]
    cv2.imwrite(output_path, cropped_image, [cv2.IMWRITE_PNG_COMPRESSION, 0])

In [10]:
def process_images_and_cut_figures(image_paths, model_path, output_folder):
    figure_detector = YOLO(model_path)
    for i, path in enumerate(image_paths, start=1):
        image = Image.open(path)
        image_np = np.asarray(image)
        detected_figures = figure_detector.predict(image_np)
        if len(detected_figures[0].boxes.boxes) > 0:
            for box in detected_figures[0].boxes.boxes.numpy():
                x_min, y_min, x_max, y_max, _, _ = box
                cut_image(path, int(x_min), int(y_min), int(x_max), int(y_max), f'{output_folder}/figure{i}.png')

In [11]:
def extract_image_paths_from_text(text, base_path):
    pattern = r'11\.\d+ [A-Z][^\n]*'
    sentence_pattern = r"(Figure|Table) \d+-\d+\.\s[\w\s, &():]+\n"
    matches = re.finditer(pattern, text)
    image_paths = []
    for match in matches:
        end = match.end()
        content = text[end:text.find(pattern, end) if (next_match := re.search(pattern, text[end:])) else len(text)].strip()
        for m in re.finditer(sentence_pattern, content):
            figure_number = int(m.group(0).split("\n")[0].split(".")[0].split("-")[1])
            figure_number = figure_number + 1 if figure_number > 4 else 5 if figure_number == 1 else figure_number
            image_paths.append(f'{base_path}/figure{figure_number}.png')
    return image_paths

In [12]:
def upload_images(image_paths, client_id='33b5a13820a6e86', album_id='Y174OuB'):
    upload_url = 'https://api.imgur.com/3/image'
    headers = {'Authorization': f'Client-ID {client_id}'}
    uploaded_image_urls = []
    for path in image_paths:
        with open(path, 'rb') as image_file:
            image_data = base64.b64encode(image_file.read()).decode('utf-8')
            data = {
                'image': image_data,
                'type': 'base64',
                'album': album_id,
                'name': os.path.basename(path),
                'description': 'Uploaded to PMbok11figures for GNN project',
                'privacy': 'hidden'
            }
            response = requests.post(upload_url, data=data, headers=headers)
            if response.status_code == 200:
                uploaded_image_urls.append(response.json()['data']['link'])
    return uploaded_image_urls

In [13]:
# imgpaths = convert_pdf_to_images('/content/drive/MyDrive/Colab Notebooks/PMBOK ch11.pdf', '/content/drive/MyDrive/Colab Notebooks/PMbok11 images')

In [14]:
# Directory containing the images
image_directory = '/content/drive/MyDrive/Colab Notebooks/PMbok11 images'
image_paths = get_image_paths_from_directory(image_directory)

In [15]:
# save figures ref per chapter after first upload
figures_ref = [[],
              ['https://i.imgur.com/xL1nauQ.png',
               'https://i.imgur.com/VM8LfK6.png',
               'https://i.imgur.com/lhNw9ex.png',
               'https://i.imgur.com/IzoFdqL.png',
               'https://i.imgur.com/xX8TmBy.png'],
              ['https://i.imgur.com/aKyeTw9.png',
               'https://i.imgur.com/vOIfhgH.png'],
              ['https://i.imgur.com/fCvLDEU.png',
               'https://i.imgur.com/EVEEAzP.png',
               'https://i.imgur.com/DOHqVvU.png'],
              ['https://i.imgur.com/MEbfhsG.png',
               'https://i.imgur.com/ZULAIZP.png',
               'https://i.imgur.com/opFNt8N.png',
               'https://i.imgur.com/6LCNTGE.png',
               'https://i.imgur.com/WTe5ZHN.png'],
              ['https://i.imgur.com/5DgdGO4.png',
               'https://i.imgur.com/1jTY6eg.png'],
              ['https://i.imgur.com/i2aQozd.png',
               'https://i.imgur.com/TVyyzp2.png'],
              ['https://i.imgur.com/WBPlzLe.png',
               'https://i.imgur.com/56inH4u.png']]

In [16]:
title_pattern = r'11\.\d+ [A-Z][^\n]*'
figure_pattern = r"\n?\s?(Figure|Table)(?!\n)\s\d+-\d+\.\s(?!Figure)[A-Z][\w\s:,.&()-]+?\n"
pdf_file_path = '/content/drive/MyDrive/Colab Notebooks/PMBOK ch11.pdf'

pdf_text = extract_text_from_pdf(pdf_file_path, start=3)
chapter_data = split_text(title_pattern, figure_pattern, pdf_text)

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [17]:
raw_df = pd.DataFrame([(title, data["content"], data["figure_names"]) for title, data in chapter_data.items()], columns=["Title", "Content", "Figures"])

In [18]:
raw_df['figures_ref'] = [ref for ref in figures_ref]

In [19]:
raw_df.sample()

Unnamed: 0,Title,Content,Figures,figures_ref
4,4 PERFORM QUANTITATIVE RISK ANALYSIS,\nPerform Quantitative Risk Analysis is the pr...,[\nFigure 11-11. Perform Quantitative Risk Ana...,"[https://i.imgur.com/MEbfhsG.png, https://i.im..."


---
# Cleaning
---

In [39]:
import nltk
import nltk.corpus
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy

In [48]:
## prepare nltk modules:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
lemm_model = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
raw_df_proc = raw_df.copy()

In [50]:
def remove_stop_words(text, custom_stop_words=None):
  words = text.split()
  stop_words = set(stopwords.words('english'))
  if custom_stop_words is not None:
      stop_words.update(custom_stop_words)
  filtered_words = [word for word in words if word not in stop_words]
  filtered_text = " ".join(filtered_words)
  return filtered_text

In [51]:
def lemmatize_text(text, model):
    doc = model(text)
    lemmatized_words = [token.lemma_ for token in doc]
    return " ".join(lemmatized_words)

In [69]:
def segment_clean_content(text, sub_pattern):
  matches = list(re.finditer(sub_pattern, text))
  titles = ["definition"]
  matched_titles = [match.group(0) for match in matches]
  sections = re.split(sub_pattern, text)
  titles.extend([f"{matched_titles[i][3:]}" \
                  for i in tqdm(range(len(matched_titles)))])
  sub_data = {}
  for i in range(len(titles)):
    sub_title = titles[i]
    sub_content = sections[i]
    sub_content = re.sub(r'Not For Distribution, Sale or Reproduction.', ' ', sub_content)
    sub_content = re.sub(r'((D|d)escribed in Section)\s\d+\.\d+\.\d+\.\d+\.', ' ', sub_content)
    sub_content = re.sub(r'ﬂ', 'FL', sub_content)
    sub_content = re.sub(r'ﬁ', 'FI', sub_content)
    sub_content = re.sub(r'\n\d+\d+\d+', ' ', sub_content)
    sub_content = re.sub(r'\s•\s', ' ', sub_content)
    sub_content = re.sub(r'\nu', ' ', sub_content)
    sub_content = re.sub(r'\n', ' ', sub_content)
    sub_content = re.sub(r'\s\.', '.', sub_content)
    sub_content = ' '.join(sub_content.split())
    sub_content = sub_content.lower()
    sub_content = re.sub(r'[^a-z\.]', ' ', sub_content)

    sub_content = remove_stop_words(sub_content)
    sentences = re.split(r'(?<=[A-Za-z])\.\s', sub_content)
    sub_data[sub_title] = [lemmatize_text(sentence, lemm_model) for sentence in sentences]
  sub_data['definition'].pop()
  return sub_data

In [62]:
def extract_definition(row):
    sentence_dict = row['Sentences']
    definition = sentence_dict.get('definition', None)
    if definition:
        row['Definition'] = definition
        del sentence_dict['definition']
        row['Sentences'] = sentence_dict
    return row

In [63]:
def clean_txt(text):
    text = re.sub(r'(Table|Figure) \d+-', 'figure ', text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]', '', text)
    text = ' '.join(text.split())
    return text
def edit_keys(dictionary):
    edited_dict = {key.split(' ', 1)[-1].lower(): value for key, value in dictionary.items()}
    return edited_dict

In [70]:
# segment then content
raw_df_proc['Sentences'] = raw_df_proc['Content'].apply(lambda x: segment_clean_content(x, r'11\.\d+\.\d+\.\d+\s[A-Z][^\n]*'))

# Apply the function to each row in the DataFrame
raw_df_proc = raw_df_proc.apply(extract_definition, axis=1)

# clean segments, Figures & titles
raw_df_proc['Sentences'] = raw_df_proc['Sentences'].apply(edit_keys)
raw_df_proc['Figures'] = raw_df_proc['Figures'].apply(lambda figs: [clean_txt(fig) for fig in figs])
raw_df_proc['Title'] = raw_df_proc['Title'].apply(lambda x: ' '.join(re.sub(r'[0-9]', '', x.lower()).split()))

0it [00:00, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [71]:
raw_df_proc.sample(3)

Unnamed: 0,Title,Content,Figures,figures_ref,Sentences,Definition
6,implement risk responses,\nImplement Risk Responses is the process of i...,[figure 18 implement risk responses inputs too...,"[https://i.imgur.com/i2aQozd.png, https://i.im...",{'project management plan': ['project manageme...,[implement risk response process implement agr...
0,prologue,"Not For Distribution, Sale or Reproduction.\n3...",[],[],{},[key concept project risk management project r...
2,identify risks,\nIdentify Risks is the process of identifying...,[figure 6 identify risks inputs tools techniqu...,"[https://i.imgur.com/aKyeTw9.png, https://i.im...",{'project management plan': ['project manageme...,[identify risk process identify individual pro...


---
# Feature Extraction
---

In [72]:
%%capture
!pip install torch transformers bert-extractive-summarizer keybert sentencepiece tensorflow

In [73]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from keybert import KeyBERT

In [74]:
%%capture
summarizer_tokenizer = T5Tokenizer.from_pretrained('t5-base')
summarizer_model = T5ForConditionalGeneration.from_pretrained('t5-base')
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

ImportError: ignored

In [None]:
feat_df = raw_df_proc.copy()

In [None]:
def summarize(text, tokenizer, model):
   inputs = tokenizer.encode("summarize: " + text, return_tensors="pt",\
                             max_length=512, truncation=True)
   outputs = model.generate(inputs, max_length=150, min_length=40,\
                          length_penalty=2.0, num_beams=4, early_stopping=True)
   summary = tokenizer.decode(outputs[0])
   return summary

In [None]:
def extract_keywords(text, kw_model):
    keywords = kw_model.extract_keywords(text)
    keywords = [keyword[0] for keyword in keywords]
    return keywords

In [None]:
# Apply summarization of Definions
feat_df['Summary'] = feat_df['Definition']\
  .progress_apply(lambda expl: re.sub('(<pad>|</s>)', '',\
  summarize(' '.join(expl), summarizer_tokenizer, summarizer_model)))

In [None]:
test_text = list(feat_df['Sentences'].values[1].values())
pprint(test_text)

In [None]:
# Apply summarization of Definions
feat_df['keywords'] = feat_df['content'].progress_apply(lambda x: extract_keywords(x, kw_model))

In [None]:
feat_df.sample()