# Install dependencies

In [16]:
# openai is pinned to the pre-1.0 SDK because this notebook calls
# openai.ChatCompletion.create, which is not available in openai>=1.0.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install openai==0.28.0 langchain python-dotenv tqdm



Setup Environment

In [17]:
# python-pptx provides pptx.Presentation, used to read .pptx slide decks.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install python-pptx



In [18]:
# PyPDF2 provides PdfReader, used to read .pdf files.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install PyPDF2



Text Extraction and Chunking

In [None]:
import os
import glob
import openai
from pptx import Presentation
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from tqdm import tqdm
# Load API keys from .env
# load_dotenv() is called before reading OPENAI_API_KEY from the environment.
# os.getenv returns None when the variable is unset, in which case
# openai.api_key is None here.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Setup Directories
# BASE_DIR: root folder that is searched recursively for .pdf and .pptx files.
# TEXT_DIR: output folder (relative to the current working directory) where
#           the per-topic Q&A text files are written; created if missing.
BASE_DIR = "/CTSE"
TEXT_DIR = "ctse_extracted_txt_files"
os.makedirs(TEXT_DIR, exist_ok=True)

def extract_pdf_text(filepath):
    """Extract the text of every page of a PDF file.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each followed by a newline. Returns an
        empty string (after printing the error) if the file cannot be opened
        or parsed.
    """
    try:
        with open(filepath, 'rb') as f:
            # Use the directly imported PdfReader: `from PyPDF2 import PdfReader`
            # does not bind the name `PyPDF2`, so `PyPDF2.PdfReader` raised a
            # NameError that the except clause below silently turned into "".
            reader = PdfReader(f)
            # Defensive: treat a missing (None) page text as empty so one
            # text-less page does not discard the whole document.
            page_texts = [(page.extract_text() or "") for page in reader.pages]
        return "".join(page_text + "\n" for page_text in page_texts)
    except Exception as e:
        # Best-effort extraction: report the failure and let the caller skip
        # this file instead of aborting the whole run.
        print(f"Error extracting PDF {filepath}: {e}")
        return ""

def extract_pptx_text(filepath):
    """Collect the text of every shape on every slide of a PPTX file.

    Args:
        filepath: Path of the .pptx file to read.

    Returns:
        The text of each shape that exposes a ``text`` attribute, each
        followed by a newline, in slide/shape order. Returns an empty string
        (after printing the error) if anything goes wrong while reading.
    """
    try:
        deck = Presentation(filepath)
        pieces = []
        for slide in deck.slides:
            # Shapes without a `text` attribute are skipped.
            pieces.extend(
                shape.text + "\n"
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
        return "".join(pieces)
    except Exception as err:
        print(f"Error extracting PPTX {filepath}: {err}")
        return ""

def extract_and_chunk_text():
    """Extract text from every PDF and PPTX file found under BASE_DIR.

    BASE_DIR is searched recursively, first for ``*.pdf`` and then for
    ``*.pptx`` files. Files whose extraction yields an empty string are
    skipped.

    Note: despite the name, no chunking is performed here; each entry holds
    the full extracted text of one document.

    Returns:
        A list of ``(topic, text)`` tuples, where ``topic`` is the file name
        without its extension.
    """
    # (extension, extractor) pairs, processed in this order.
    extractors = (
        ("pdf", extract_pdf_text),
        ("pptx", extract_pptx_text),
    )

    text_with_topics = []
    for extension, extract in extractors:
        pattern = f"{BASE_DIR}/**/*.{extension}"
        for filepath in tqdm(glob.glob(pattern, recursive=True)):
            text = extract(filepath)
            if text:
                # splitext strips only the trailing extension; str.replace
                # would also remove ".pdf"/".pptx" from the middle of a name.
                topic = os.path.splitext(os.path.basename(filepath))[0]
                text_with_topics.append((topic, text))

    return text_with_topics


In [22]:
def get_answer_from_openai(prompt, model="gpt-3.5-turbo", max_tokens=150):
    """Send a prompt to the OpenAI chat completion API and return the reply.

    Uses the pre-1.0 ``openai.ChatCompletion`` interface.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        The content of the first choice's message with surrounding whitespace
        removed; an empty string if that content is missing.

    Raises:
        Any exception raised by ``openai.ChatCompletion.create`` propagates
        to the caller.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            # System message for context
            {"role": "system", "content": "You are a helpful assistant."},
            # User message with the actual prompt
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
    )
    content = response.choices[0].message['content']
    # Guard against a None content so .strip() cannot raise AttributeError.
    return (content or "").strip()

In [23]:
import os
def save_qa_to_file(topic, question, answer):
    """Append one question/answer pair to the topic's Q&A text file.

    The target file is ``<TEXT_DIR>/<topic>_qa.txt``. Its parent directory is
    created when missing, and the file is opened in append mode so earlier
    entries are kept.

    Args:
        topic: Topic name used to build the file name.
        question: The question text to record.
        answer: The answer text to record.
    """
    destination = f"{TEXT_DIR}/{topic}_qa.txt"
    os.makedirs(os.path.dirname(destination), exist_ok=True)

    # One record = question line, answer line, then an 80-dash divider.
    record_lines = [
        f"Question: {question}\n",
        f"Answer: {answer}\n",
        "-" * 80 + "\n",
    ]
    with open(destination, "a", encoding="utf-8") as handle:
        handle.writelines(record_lines)

In [28]:
def interact_and_generate_qa(text_with_topics, max_context_chars=3000):
    """Let the user pick a topic, ask a question, and answer it via OpenAI.

    Lists the distinct topics, reads a topic number and a question from
    standard input, builds a prompt from the selected topic's text, prints
    the generated answer and appends the Q&A pair to the topic's file.

    Args:
        text_with_topics: Iterable of ``(topic, text)`` tuples.
        max_context_chars: Maximum number of characters of topic text to
            include in the prompt, to stay within the model's token limit.

    Returns:
        None. Returns early (after printing a message) when there are no
        topics or the selection is invalid.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # numbering is stable between runs (a set has no guaranteed order).
    topics = list(dict.fromkeys(topic for topic, _ in text_with_topics))
    if not topics:
        print("No topics available.")
        return

    print("Available topics:")
    for idx, topic in enumerate(topics, 1):
        print(f"{idx}. {topic}")

    # Ask user to select a topic
    try:
        selected_topic_idx = int(input("Select a topic by number: ")) - 1
    except ValueError:
        selected_topic_idx = -1
    # Explicit range check: a negative index (e.g. from entering 0) would not
    # raise IndexError but silently pick a topic from the end of the list.
    if not 0 <= selected_topic_idx < len(topics):
        print("Invalid selection. Please try again.")
        return
    selected_topic = topics[selected_topic_idx]

    # Gather every text that belongs to the selected topic
    selected_chunks = [text for topic, text in text_with_topics if topic == selected_topic]

    # Ask user for a question
    question = input(f"Ask a question related to the topic '{selected_topic}': ")

    # Combine context and question to form the prompt; the context is
    # truncated to avoid exceeding token limits.
    context = "\n".join(selected_chunks)[:max_context_chars]
    prompt = f"Answer the following question based on the context:\n\nContext: {context}\n\nQuestion: {question}"

    # Get the answer from OpenAI API
    answer = get_answer_from_openai(prompt)

    # Display the generated answer
    print(f"Generated Answer: {answer}")

    # Save the question and answer to a file
    save_qa_to_file(selected_topic, question, answer)

"""### Start the Extraction and Q&A Generation Process"""

# Collect (topic, text) pairs from every PDF and PPTX file found under BASE_DIR.
# Despite its name, extract_and_chunk_text() returns each document's full
# extracted text; no splitting into chunks takes place.
text_with_topics = extract_and_chunk_text()

# Start the interactive session: prompts for a topic and a question, prints
# the generated answer, and appends the Q&A to a per-topic file in TEXT_DIR.
interact_and_generate_qa(text_with_topics)

0it [00:00, ?it/s]
100%|██████████| 12/12 [00:00<00:00, 16.94it/s]


Available topics:
1. Intro to DevOps and Beyond
2. Introduction to Microservices
3. Containers 101
4. CAP Theorem
5. Cloud Design Patterns - 1
6. AWS User Groups Colombo - Introduction to AWS Cloud Platform
7. Microservice Design Patterns
8. Lecture 2 - Part 1
9. Key Essentials for Building Application in Cloud
10. Cloud Computing 101
11. Cloud Design Patterns - 2
12. Lecture 2 - Part 2
Select a topic by number: 5
Ask a question related to the topic 'Cloud Design Patterns - 1': tell me the types of Cloud Design Patterns
Generated Answer: Based on the provided context, the types of Cloud Design Patterns mentioned are as follows:

1. Cache-Aside Pattern
   - Solutions: Azure Cache, AWS ElastiCache, Google App Engine memcache, Redis Cache
   - Pros: Increased performance
   - Cons: Maintaining consistency between data in cache & data in the underlying data store
   - Parameters: What to cache, Lifetime of cached data, Cache size, Evicting data, In-Memory Caching for read/write performance