In [37]:
import re
import nltk
import pandas as pd
import PyPDF2
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

def extract_text_from_pdf(file_path):
    """Read every page of the PDF at *file_path* and return its text as one string.

    Pages for which PyPDF2 returns no text are skipped. Within each page, a
    hyphen immediately followed by a line break is treated as a word split
    across lines and removed; every remaining line break becomes a space.
    The per-page strings are joined with single spaces.
    """
    page_texts = []
    with open(file_path, "rb") as pdf_stream:
        for page in PyPDF2.PdfReader(pdf_stream).pages:
            raw = page.extract_text()
            if not raw:
                continue  # nothing extractable on this page
            # Re-join words that were hyphenated across a line break.
            rejoined = re.sub(r'-\n(\w+)', r'\1', raw)
            # Flatten the page to a single line.
            page_texts.append(rejoined.replace('\n', ' '))
    return " ".join(page_texts)

def _compile_keyword_regex(keywords):
    """Compile a case-insensitive whole-word alternation of *keywords*, or None if there are none."""
    patterns = [r'\b' + re.escape(keyword) + r'\b' for keyword in keywords]
    if not patterns:
        # '|'.join([]) is the empty pattern, which matches every string;
        # report "no keywords" explicitly instead of compiling it.
        return None
    return re.compile('|'.join(patterns), re.IGNORECASE)


def extract_and_summarize(text, start_keywords, end_keywords, word_limit=100):
    """Extract a run of sentences from *text* delimited by keywords and a word budget.

    The text is split into sentences with ``sent_tokenize``. Capture begins at
    the first sentence containing any of *start_keywords* and every following
    sentence is appended until one of these happens:

    * the running word count has exceeded *word_limit* and the sentence just
      appended ends with a full stop, or
    * the sentence just appended contains any of *end_keywords*, or
    * the text runs out.

    Keywords are matched case-insensitively between word boundaries.

    Args:
        text: The text to search. Empty or ``None`` yields ``''``.
        start_keywords: Phrases that start the capture. If empty, nothing can
            start a capture and ``''`` is returned.
        end_keywords: Phrases that stop the capture. If empty, only the word
            budget (or the end of the text) stops it.
        word_limit: Number of whitespace-separated words after which the
            capture stops at the next sentence ending in ``'.'``.

    Returns:
        The captured sentences joined with single spaces, or ``''`` when no
        sentence matches a start keyword.
    """
    start_regex = _compile_keyword_regex(start_keywords)
    end_regex = _compile_keyword_regex(end_keywords)
    if not text or start_regex is None:
        return ''

    extracted_text = []
    capture = False
    word_count = 0

    for sentence in sent_tokenize(text):
        if not capture:
            if not start_regex.search(sentence):
                continue
            capture = True

        word_count += len(sentence.split())
        extracted_text.append(sentence)

        # Past the budget: finish on the first sentence that ends in a full
        # stop so the summary is not cut off mid-thought.
        if word_count > word_limit and sentence.endswith('.'):
            break
        # NOTE(review): the end keywords are also tested against the sentence
        # that triggered the capture, so start/end lists that share a phrase
        # end the summary after one sentence -- confirm this is intended.
        if end_regex is not None and end_regex.search(sentence):
            break

    return ' '.join(extracted_text)

# Keyword lists used as section delimiters when calling extract_and_summarize.
# That function matches them case-insensitively as whole words, so entries
# that differ only in letter case are equivalent for matching purposes.

# Phrases marking the composition / ingredients section.
composition_keywords = [
    "mg", "containing", "active ingredient", "composition", "contains",
    "Ingredients", "Active Ingredients", "Inactive Ingredients", "Compounds",
    "Concentration", "Formulation", "Content", "Dosage", "Excipients",
    "Chemical Composition", "Additives", "Preservatives",
    "Nutritional Information", "Allergens", "API", "Molecular Formula",
    "Batches", "Synthesis", "Properties", "Standards",
]

# Phrases marking the contraindications / warnings section.
contraindications_keywords = [
    "contraindications", "do not take", "not take if", "warnings",
    "do not use", "should not be taken",
    "Contraindications", "Warnings", "Precautions", "Risk Factors", "Avoid",
    "Not Recommended", "Hypersensitivity", "Allergic Reactions",
    "Adverse Reactions", "Safety Alerts", "Interactions", "Medical Conditions",
    "Prohibited", "Disallowed", "Health Risks", "Limitations", "Do Not Use If",
    "Exclusion Criteria", "Unsuitable For", "Health Warnings",
]

# Phrases marking the adverse reactions / side effects section.
adverse_reactions_keywords = [
    "stop", "adverse reactions", "side effects", "possible side effects",
    "unwanted effects", "reactions",
    "Adverse Reactions", "Side Effects", "Complications", "Risks", "Warnings",
    "Symptoms", "Undesirable Effects", "Negative Effects", "Harmful Effects",
    "Allergic Reactions", "Toxicity", "Safety Concerns", "Intolerance",
    "Discomfort", "Consequences", "Aftereffects", "Hypersensitivity",
    "Counteractions", "Unwanted Effects",
]

# Phrases marking the pregnancy / breast-feeding section.
pregnancy_keywords = [
    "pregnant", "pregnancy", "breast-feeding and fertility", "breast-feeding",
    "during pregnancy",
    "Pregnancy", "Expectant Mother", "Gestation", "Trimester", "Prenatal Care",
    "Maternity", "Obstetrics", "Ob/Gyn", "Fetal Development", "Ultrasound",
    "Birth Plan", "Due Date", "Conception", "Antenatal", "Childbirth",
    "Labor and Delivery", "Postpartum", "Breastfeeding", "Infant Care",
    "Family Planning", "Contraception",
]

# Phrases marking the driving / using machines section.
driving_keywords = [
    "capability to drive a car", "driving", "effects on ability to drive",
    "driving and using machines", "drive",
    "Driving", "Operate Machinery", "Motor Skills", "Coordination", "Alertness",
    "Impairment", "Drowsiness", "Sedation", "Concentration", "Attention",
    "Reflexes", "Reaction Time", "Caution", "Warning", "Do Not Drive",
    "Vehicle Operation", "Safety Advice", "Influence on Driving",
    "Cognitive Function", "Physical Ability",
]

# Load the leaflet and flatten it to a single string.
pdf_path = 'ibuprofen_leaflet.pdf'
text = extract_text_from_pdf(pdf_path)

# Each section starts at its own keywords and ends at the keywords of the
# section that follows it. The last section has no successor, so it is closed
# by a phrase that is passed purely as a stand-in end marker.
section_keywords = [
    composition_keywords,
    contraindications_keywords,
    adverse_reactions_keywords,
    pregnancy_keywords,
    driving_keywords,
    ['end of document'],
]
(
    composition_summary,
    contraindications_summary,
    adverse_reactions_summary,
    pregnancy_summary,
    driving_summary,
) = [
    extract_and_summarize(text, start, end)
    for start, end in zip(section_keywords, section_keywords[1:])
]

# One row per section: its display name and the extracted summary.
data = {
    "Section": [
        "Composition",
        "Contraindications",
        "Adverse Reactions",
        "Pregnancy",
        "Capability to Drive",
    ],
    "Content": [
        composition_summary,
        contraindications_summary,
        adverse_reactions_summary,
        pregnancy_summary,
        driving_summary,
    ],
}
df = pd.DataFrame(data)
df.head(5)  # preview of the first rows; the return value is not assigned

# Write the table to an Excel workbook without the index column.
excel_path = 'output.xlsx'
df.to_excel(excel_path, index=False, engine='openpyxl')


[nltk_data] Downloading package punkt to /Users/ibalica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
