In [27]:
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import pytesseract

In [None]:
from pdf2image import convert_from_path
import spacy
# Fetch the small English spaCy model only when it is not installed yet, so
# that re-running this cell does not trigger a fresh download every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")
from textblob import TextBlob
import re

In [None]:
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the combined text.

    The file at ``pdf_path`` is converted with ``convert_from_path`` (one image
    per page), each image is passed to ``pytesseract.image_to_string`` in the
    order returned, and the resulting strings are concatenated without any
    separator.
    """
    page_images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(img) for img in page_images]
    return "".join(page_texts)


In [None]:
def extract_text_from_image(image_path):
    """Run OCR on a single image file and return the recognised text.

    Args:
        image_path: Path of the image, passed to ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the opened image.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR has finished, even if pytesseract raises.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)


In [None]:
# Load the small English spaCy pipeline once at module level; extract_entities() reads this global.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Collect emails, phone numbers, dates and place names found in ``text``.

    Emails and phone numbers are located with regular expressions; dates and
    places come from the spaCy pipeline bound to the module-level ``nlp``.

    Args:
        text: Plain text to scan (for example OCR output).

    Returns:
        A dict with the keys ``"emails"``, ``"phone_numbers"``, ``"dates"`` and
        ``"addresses"`` (in that order). Each value is a list of matched
        strings in order of appearance. ``"dates"`` holds entities labelled
        ``DATE``; ``"addresses"`` holds entities labelled ``GPE`` or ``LOC``.
    """
    # Match only characters that belong to an address, so surrounding
    # punctuation such as "<a@b.com>," is not captured with the email.
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

    # Accept a bare 10-digit number as well as common written forms:
    # "(555) 123-4567", "555-123-4567", "555.123.4567", "+1 555 123 4567".
    # The look-arounds keep the match from starting or ending inside a longer
    # run of word characters (e.g. an 11-digit id). All groups are
    # non-capturing so re.findall returns the full match.
    phone_pattern = (
        r'(?<!\w)(?:\+\d{1,3}[ .-]?)?'
        r'(?:\(\d{3}\)|\d{3})[ .-]?\d{3}[ .-]?\d{4}(?!\w)'
    )

    doc = nlp(text)
    entities = {
        "emails": re.findall(email_pattern, text),
        "phone_numbers": re.findall(phone_pattern, text),
        "dates": [],
        "addresses": [],
    }

    # Use spaCy's NER output for dates and place-like entities.
    for ent in doc.ents:
        if ent.label_ == "DATE":
            entities["dates"].append(ent.text)
        elif ent.label_ in ("GPE", "LOC"):
            entities["addresses"].append(ent.text)

    return entities


In [None]:
def get_sentiment(text):
    """Return the polarity that TextBlob's sentiment analysis assigns to ``text``."""
    return TextBlob(text).sentiment.polarity


In [None]:
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

def extract_keywords(text, num_keywords=5):
    """Return the most frequent non-stop-word terms in ``text``.

    The text is lower-cased and split on whitespace. Leading and trailing
    punctuation is stripped from every token before filtering, so that a word
    followed by a comma or full stop is still counted instead of being
    discarded by the ``isalpha()`` check.

    Args:
        text: Text to analyse.
        num_keywords: Maximum number of keywords to return (default 5).

    Returns:
        A list of up to ``num_keywords`` words, most frequent first. Tokens
        that are not purely alphabetic after stripping, or that appear in
        ``STOP_WORDS``, are ignored.
    """
    import string  # local import: only needed for punctuation stripping

    stripped = (raw.strip(string.punctuation) for raw in text.lower().split())
    words = [tok for tok in stripped if tok.isalpha() and tok not in STOP_WORDS]
    return [word for word, _count in Counter(words).most_common(num_keywords)]


In [None]:
def process_file(file_path, file_type='pdf'):
    """Extract text from a file and print entities, sentiment and keywords.

    ``file_type`` selects the extractor: ``'pdf'`` uses
    ``extract_text_from_pdf`` and ``'image'`` uses ``extract_text_from_image``.
    Any other value prints a message and returns without doing further work.
    All results are printed; nothing is returned.
    """
    # Reject anything that is neither a PDF nor an image up front.
    if file_type not in ('pdf', 'image'):
        print("Unsupported file type.")
        return

    if file_type == 'pdf':
        text = extract_text_from_pdf(file_path)
    else:
        text = extract_text_from_image(file_path)
    print("Extracted Text:\n", text)

    # Entities, one line per category
    entities = extract_entities(text)
    print("\nExtracted Entities:")
    for label, found in entities.items():
        print(f"{label.capitalize()}: {found}")

    # Sentiment
    sentiment_score = get_sentiment(text)
    print("\nSentiment Score:", sentiment_score)

    # Keywords
    keywords = extract_keywords(text)
    print("\nKeywords:", keywords)


In [None]:
# Demo run on a sample document
file_path = '../DocumentClassification/data/email/doc_000694.png'  # point this at your own PDF or image
file_type = 'image'  # either 'pdf' or 'image'
process_file(file_path, file_type=file_type)


In [None]:
import json
import spacy
from textblob import TextBlob
from transformers import pipeline
import pdf2image
import pytesseract

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load summarization model. The checkpoint is named explicitly so results do
# not depend on whichever default the installed transformers release selects.
# NOTE(review): sshleifer/distilbart-cnn-12-6 is believed to be the library's
# default summarization checkpoint for PyTorch — confirm before relying on it.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def extract_text_from_pdf(file_path):
    """Return the OCR text of all pages of the PDF at ``file_path``.

    ``pdf2image.convert_from_path`` supplies the page images; each one is run
    through ``pytesseract.image_to_string`` and the results are joined with no
    separator.
    """
    return "".join(
        pytesseract.image_to_string(page_image)
        for page_image in pdf2image.convert_from_path(file_path)
    )

def extract_text_from_image(file_path):
    """Return the text pytesseract recognises for ``file_path``.

    The path is handed straight to ``pytesseract.image_to_string``; this
    function does not open the file itself.
    """
    return pytesseract.image_to_string(file_path)

def extract_entities(text):
    """Group the named entities found in ``text`` by their label.

    Returns a dict mapping each entity label to the list of entity texts
    carrying that label, in the order the entities are yielded by ``doc.ents``.
    """
    grouped = {}
    for entity in nlp(text).ents:
        grouped.setdefault(entity.label_, []).append(entity.text)
    return grouped

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    polarity = TextBlob(text).sentiment.polarity
    return polarity

def extract_keywords(text):
    """Return the text of each noun chunk spaCy yields for ``text``.

    Chunks are listed in the order ``doc.noun_chunks`` produces them;
    duplicates are kept.
    """
    noun_phrases = []
    for chunk in nlp(text).noun_chunks:
        noun_phrases.append(chunk.text)
    return noun_phrases

def summarize_text(text, max_length=130, min_length=30):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or ``""`` when
        ``text`` is empty or whitespace-only (the model is not called then).
    """
    # OCR can legitimately yield no text; there is nothing to summarize.
    if not text or not text.strip():
        return ""

    # truncation=True lets the pipeline cut over-long input down to the
    # model's maximum input size instead of failing on long documents.
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

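# NOTE: detect_language below is a disabled draft; process_file does not call it.
# def detect_language(text):
#     # Use spaCy's `lang` pipeline for language detection
#     # Load `xx_ent_wiki_sm` or another language model
#     nlp_lang = spacy.load("en_core_web_sm")  # overwritten by the next line
#     nlp_lang = spacy.blank("xx")  # Multilingual blank model
#     nlp_lang.add_pipe("language_detector")  # TODO confirm this component is registered before enabling
#     language = nlp_lang(text)._.language
#     return language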


def process_file(file_path, file_type='pdf'):
    """Run the full analysis on one file, print each result and return them.

    ``file_type`` selects the extractor: ``'pdf'`` uses
    ``extract_text_from_pdf`` and ``'image'`` uses ``extract_text_from_image``.
    Any other value prints a message and returns ``None``.

    Returns:
        A dict with the keys ``extracted_text``, ``entities``,
        ``sentiment_score``, ``keywords`` and ``summary``. The same dict is
        also printed as indented JSON.
    """
    # Bail out early for anything that is neither a PDF nor an image.
    if file_type not in ('pdf', 'image'):
        print("Unsupported file type.")
        return

    if file_type == 'pdf':
        text = extract_text_from_pdf(file_path)
    else:
        text = extract_text_from_image(file_path)
    print("Extracted Text:\n", text)

    # Named entities, one line per label
    entities = extract_entities(text)
    print("\nExtracted Entities:")
    for label, mentions in entities.items():
        print(f"{label.capitalize()}: {mentions}")

    # Sentiment
    sentiment_score = get_sentiment(text)
    print("\nSentiment Score:", sentiment_score)

    # Keywords
    keywords = extract_keywords(text)
    print("\nKeywords:", keywords)

    # Summary
    summary = summarize_text(text)
    print("\nSummary:", summary)

    # Bundle every result and echo the bundle as JSON
    output_data = dict(
        extracted_text=text,
        entities=entities,
        sentiment_score=sentiment_score,
        keywords=keywords,
        summary=summary,
    )
    print("\nJSON Output:", json.dumps(output_data, indent=4))
    return output_data

# Example call for a PDF:
# process_file('path/to/your/file.pdf', file_type='pdf')
# Demo run on a sample image
file_path = '../DocumentClassification/data/email/doc_000694.png'  # point this at your own PDF or image
file_type = 'image'  # either 'pdf' or 'image'
process_file(file_path, file_type=file_type)

