In [2]:
import os
import warnings

import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from PIL import ImageFilter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

def preprocess_pdf(file_path):
    """Extract the text of a PDF by OCR-ing each rendered page.

    Args:
        file_path: Path of the PDF file handed to ``convert_from_path``.

    Returns:
        The OCR output of every page, in page order, joined with a single
        newline. An empty string when no pages are produced.
    """
    page_images = convert_from_path(file_path)
    # One OCR call per rendered page; pages are separated by '\n'.
    return '\n'.join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )

def preprocess_image(file_path, target_size=None, threshold=128, denoise_size=3):
    """Clean up an image file and return the text Tesseract reads from it.

    The image is converted to grayscale, optionally resized, denoised with a
    median filter, binarized against ``threshold`` and then passed to OCR.

    Args:
        file_path: Path of the image file to open.
        target_size: Optional ``(width, height)`` to resize to before OCR.
            ``None`` (default) keeps the original size.
        threshold: Grayscale cut-off in 0-255. Pixels below it become black
            (0), all others white (255).
        denoise_size: Window size of the median filter (Pillow requires an
            odd integer). A falsy value (``0`` / ``None``) skips denoising.

    Returns:
        The text recognized by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(file_path) as source:
        # Convert to grayscale first: Pillow cannot filter palette ('P') or
        # bilevel ('1') images, and thresholding needs single-channel pixels.
        image = source.convert('L')
        if target_size is not None:
            image = image.resize(target_size)
        if denoise_size:
            image = image.filter(ImageFilter.MedianFilter(size=denoise_size))
        # Binarize: everything darker than the threshold becomes black.
        image = image.point(lambda x: 0 if x < threshold else 255, '1')
        return pytesseract.image_to_string(image)

pdf_directory = 'pdf_documents'
image_directory = 'image_documents'
preprocessed_texts = []  # kept for compatibility; nothing in this cell populates it


def _ocr_directory(directory, extensions, extract_text):
    """Apply ``extract_text`` to every matching file directly inside ``directory``.

    Args:
        directory: Folder whose entries are listed (not recursive).
        extensions: Tuple of lower-case file suffixes to accept, e.g. ``('.pdf',)``.
        extract_text: Callable taking a file path and returning its text.

    Returns:
        List of extracted texts, ordered by file name.
    """
    texts = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the corpus
    # order (and anything fitted on it) stable across runs and machines.
    for filename in sorted(os.listdir(directory)):
        # Compare in lower case so 'SCAN.PDF' or 'photo.JPG' are not skipped.
        if filename.lower().endswith(extensions):
            texts.append(extract_text(os.path.join(directory, filename)))
    return texts


# Preprocess PDF files
pdf_texts = _ocr_directory(pdf_directory, ('.pdf',), preprocess_pdf)

# Preprocess image files
image_texts = _ocr_directory(image_directory, ('.jpg', '.png'), preprocess_image)

# Combine PDF and image texts
all_texts = pdf_texts + image_texts

# Rest of your code for feature extraction, training the model, etc.


In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer to convert text documents into a matrix of token counts
vectorizer = CountVectorizer(stop_words='english', lowercase=True, decode_error='ignore')
X = vectorizer.fit_transform(all_texts)

# Define the number of topics
num_topics = 10

# Apply Latent Dirichlet Allocation (LDA); the fixed seed keeps topics reproducible
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Get the topic-word matrix (iterated below as one row of word weights per topic)
topic_word_matrix = lda.components_

# Get the most important words for each topic
num_top_words = 30
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# argsort() is ascending, so reverse it and keep the first `num_top_words`
# indices to get the highest-weighted words of each topic, strongest first.
topics = [
    [feature_names[i] for i in topic.argsort()[::-1][:num_top_words]]
    for topic in topic_word_matrix
]

# Print the topics and their associated keywords
for topic_number, top_words in enumerate(topics, start=1):
    print(f"Topic {topic_number}:")
    print(", ".join(top_words))
    print()


Topic 1:
reading, ieltsfever, com, people, test, academic, www, passage, music, answer, questions, help, https, write, public, information, new, says, sheet, boxes, fb, water, used, ee, use, work, eltsfever, following, years, human

Topic 2:
banana, island, easter, crocodiles, body, water, paragraph, disease, statues, moai, crocodile, years, varieties, genetic, sigatoka, bananas, fruit, long, crocodilian, edible, crops, resistant, world, year, survive, paragraphs, theories, breeders, plants, stone

Topic 3:
innovation, fatty, trans, acids, food, passage, com, products, questions, new, answer, people, reading, companies, information, test, ieltsfever, 20, write, www, process, sheet, boxes, biofuels, crops, company, risk, ielts, based, energy

Topic 4:
world, old, new, ee, seaweeds, 45, ieltsfever, fb, https, oooo, academic, species, oe, helpg, optimism, section, test, animals, ae, water, sea, oo, eltsfever, america, optimists, seaweed, european, people, com, ooo

Topic 5:
reading, ielts

