<a href="https://colab.research.google.com/github/khalilDimassi/Datasci_Resources/blob/master/project_v0_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2 networkx nltk gensim

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2
import re
import networkx as nx

import nltk
# Fetch only the stop-word corpus read by remove_stop_words(). A bare
# nltk.download() opens the interactive downloader, which blocks "Run All".
nltk.download('stopwords')
import nltk.corpus
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary



In [None]:
def extract_text_from_pdf(file_path):
  """Return the text of every page of a PDF as a single string.

  Args:
    file_path: Path of the PDF file to open (read in binary mode).

  Returns:
    The per-page texts in page order, joined by a newline. A page for which
    the extractor returns nothing contributes an empty string.
  """
  with open(file_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # `or ''` keeps the join safe if a page yields None instead of a string.
    page_texts = [page.extract_text() or '' for page in reader.pages]
  # Separate pages with a newline so the last word of one page is not fused
  # with the first word of the next.
  return '\n'.join(page_texts)

def clean_text(text):
  """Reduce `text` to ASCII letters separated by single spaces.

  Digits, punctuation and every other non-letter character are removed
  (so "don't" becomes "dont"); runs of whitespace become one space and
  leading/trailing whitespace is dropped.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string.
  """
  # Turn line breaks, tabs and other whitespace into spaces first; the filter
  # below would otherwise delete them and glue neighbouring words together.
  text = re.sub(r'\s+', ' ', text)
  text = re.sub('[^a-zA-Z ]', '', text)  # remove digits, punctuation, symbols
  text = ' '.join(text.split())  # collapse the gaps left by removed tokens
  return text

def standardize_text(text):
  """Return a lowercase copy of `text`."""
  lowered = text.lower()
  return lowered


def stem_text(text):
  """Split `text` on whitespace and stem every token.

  Returns:
    A list holding the PorterStemmer stem of each token, in input order.
  """
  porter = PorterStemmer()
  return list(map(porter.stem, text.split()))

def remove_stop_words(stemmed_words, custom_stop_words=None):
  """Drop English stop words from a list of stemmed tokens.

  Because the tokens have already been stemmed, every stop word is matched in
  both its original and its stemmed form. Comparing against the unstemmed list
  alone lets stop words whose stem differs from the word itself slip through.

  Args:
    stemmed_words: Iterable of tokens to filter.
    custom_stop_words: Optional iterable of extra stop words; they are added
      to the NLTK English list (and stemmed like the built-in ones).

  Returns:
    A list of the tokens that are not stop words, in their original order.
  """
  stop_words = set(stopwords.words('english'))
  if custom_stop_words is not None:
    stop_words.update(custom_stop_words)
  stemmer = PorterStemmer()
  # The set comprehension is fully built before the in-place union, so the
  # set is never modified while it is being iterated.
  stop_words |= {stemmer.stem(word) for word in stop_words}
  return [word for word in stemmed_words if word not in stop_words]


def lda_segmentation(documents, num_topics=10, passes=15, random_state=None):
  """Group consecutive documents that share the same dominant LDA topic.

  An LDA model is trained on `documents`; each document is labelled with its
  most probable topic, and a new segment starts whenever that label differs
  from the label of the previous document.

  Args:
    documents: Sequence of documents, each one an iterable of tokens (the
      input format of gensim's Dictionary).
    num_topics: Number of LDA topics to fit.
    passes: Number of training passes over the corpus.
    random_state: Forwarded to LdaModel; pass an int for reproducible topics.

  Returns:
    A list of segments, each a list of consecutive documents. Empty input
    gives an empty list.
  """
  if len(documents) == 0:
    return []

  dictionary = Dictionary(documents)
  corpus = [dictionary.doc2bow(text) for text in documents]
  model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                   passes=passes, random_state=random_state)

  # Each document's distribution is a list of (topic_id, probability) pairs.
  # minimum_probability=0.0 keeps low-probability topics in the list so that
  # max() never sees an empty distribution.
  topic_distributions = model.get_document_topics(corpus, minimum_probability=0.0)
  dominant_topics = [
      max(topic_distribution, key=lambda pair: pair[1])[0]
      for topic_distribution in topic_distributions
  ]

  # Segment the text at the boundaries between topic changes
  segments = []
  current_topic = dominant_topics[0]
  current_segment = []
  for document, topic in zip(documents, dominant_topics):
    if topic != current_topic:
      segments.append(current_segment)
      current_segment = []
      # Track the new topic so later documents are compared with their
      # predecessor, not with the very first document.
      current_topic = topic
    current_segment.append(document)
  segments.append(current_segment)

  return segments


def _segment_token_set(segment):
  """Flatten one segment into the set of distinct tokens it contains."""
  tokens = set()
  for item in segment:
    if isinstance(item, str):
      tokens.add(item)  # segment given as a flat list of tokens
    else:
      tokens.update(item)  # segment given as a list of token lists
  return tokens


def create_graph_from_segments(segments, threshold=0.5):
  """Build an undirected similarity graph with one node per segment.

  Nodes are numbered 0..len(segments)-1 in input order and keep their segment
  in the `features` attribute. Two nodes are linked when the Jaccard
  similarity of their token sets (|A & B| / |A | B|) is strictly greater than
  `threshold`; the similarity is stored as the edge `weight`.

  Args:
    segments: Iterable of segments. A segment is either a list of tokens or a
      list of token lists (the shape lda_segmentation returns).
    threshold: Minimum similarity (exclusive) required for an edge.

  Returns:
    The populated networkx Graph.
  """
  graph = nx.Graph()
  token_sets = []
  # Add a node for each segment
  for node_id, segment in enumerate(segments):
    graph.add_node(node_id, features=segment)
    token_sets.append(_segment_token_set(segment))
  # Add edges between the nodes based on their similarity
  for node_id1 in range(len(token_sets)):
    for node_id2 in range(node_id1 + 1, len(token_sets)):
      union = token_sets[node_id1] | token_sets[node_id2]
      if not union:
        continue  # two empty segments: similarity is undefined, add no edge
      shared = token_sets[node_id1] & token_sets[node_id2]
      similarity = len(shared) / len(union)
      if similarity > threshold:
        graph.add_edge(node_id1, node_id2, weight=similarity)
  return graph


In [None]:
PDF_PATH = "/content/PMBOK ch11.pdf"

# `text` keeps the raw extraction. The section-heading regex in the next cell
# needs the digits, dots, capitals and line breaks that clean_text() and
# standardize_text() remove, so each derived stage gets its own name instead
# of overwriting `text`.
text = extract_text_from_pdf(PDF_PATH)
text_clean = clean_text(text)
text_standardized = standardize_text(text_clean)
stems = stem_text(text_standardized)
words = remove_stop_words(stems, custom_stop_words=None)

In [None]:
import pandas as pd

# Section headings: "11.", one or more dot-separated numbers, a space, then a
# capitalised title running to the end of the line.
pattern = r'11\.\d+(\.\d+)* [A-Z][^\n]*'

# NOTE: the pattern needs digits, dots, upper-case letters and line breaks, so
# `text` has to be the raw extracted text; output of clean_text() or
# standardize_text() can never match it.
# Collect every heading once so each section can be bounded by the next one.
matches = list(re.finditer(pattern, text))

ListData = []

for index, match in enumerate(matches):
    # Get the matched title
    title = match.group(0).strip()
    print("Title:", title)

    # A section's body runs from the end of its heading to the start of the
    # next heading, or to the end of the text for the last section.
    if index + 1 < len(matches):
        next_match = matches[index + 1]
        print("Next Title:", next_match.group(0).strip())
        content_end = next_match.start()
    else:
        print("No more titles found.")
        content_end = len(text)

    content = text[match.end():content_end].strip()

    print("Content:", content)
    print("----")
    ListData.append((title, content))


df = pd.DataFrame(ListData, columns=['title', 'Body'])
df