In [1]:
# RAG English Teacher System
# This system uses Retrieval-Augmented Generation (RAG) to create English lessons.
# It combines a vector database for storing and retrieving relevant information
# with a language model for generating engaging lessons.

# Necessary imports

import os
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import numpy as np
import PyPDF2
from fpdf import FPDF
from functools import lru_cache
from groq import Groq
import nltk
from nltk.tokenize import sent_tokenize
import uuid
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Set up the working directory and NLTK data

working_directory = os.getcwd()  # Directory the kernel is currently running in
nltk_data_dir = os.path.join(working_directory, 'nltk_data')  # Project-local 'nltk_data' folder

# Register the project-local folder with NLTK. The membership check keeps this
# cell idempotent: re-running it does not append the same path a second time.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Fetch the 'punkt_tab' tokenizer data into that folder. NLTK reports the
# package as already up-to-date when it is present (see the cell output).
nltk.download('punkt_tab', download_dir=nltk_data_dir)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nikhil/Desktop/project-folder/RAG_AI_Tutor/AI-
[nltk_data]     Tutor/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load environment variables from the .env file
load_dotenv()

# Read the API key. The notebook's own variable name is 'api_key'; fall back to
# an already exported GROQ_API_KEY so a pre-configured environment still works.
api_key = os.getenv('api_key') or os.getenv('GROQ_API_KEY')

# Fail early with an actionable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
if not api_key:
    raise RuntimeError(
        "Groq API key not found: define 'api_key' (or 'GROQ_API_KEY') in the "
        "environment or in a .env file."
    )

# Export the key as GROQ_API_KEY. The Groq client is later constructed without
# an explicit key, so it is expected to read it from this variable.
os.environ["GROQ_API_KEY"] = api_key

In [4]:
# Create the vector database class

class VectorDatabase:
    """
    A class to handle vector database operations using Chroma DB.

    Text is split into sentence chunks, stored in a persistent Chroma
    collection, and retrieved by similarity to a query string. Embeddings are
    produced by the collection's SentenceTransformer embedding function.
    """

    # Upper bound on the number of chunks written to Chroma per call, so a
    # large document is stored in several moderate requests instead of one.
    _BATCH_SIZE = 1000

    # Backing field for the lazily created ``encoder`` attribute.
    _encoder = None

    def __init__(self, collection_name="english_teacher_collection",
                 persist_path="./chroma_db", model_name='all-MiniLM-L6-v2'):
        """
        Open the persistent Chroma store and get or create the collection.

        Args:
            collection_name: Name of the collection to get or create.
            persist_path: Directory in which Chroma persists its data.
            model_name: SentenceTransformer model used to embed documents
                and queries.
        """
        self.model_name = model_name
        self.client = chromadb.PersistentClient(path=persist_path)
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
        self.collection = self.client.get_or_create_collection(name=collection_name, embedding_function=self.embedding_function)

    @property
    def encoder(self):
        """
        Standalone SentenceTransformer for ``model_name``, built on first access.

        Nothing in this class uses it (the collection embeds through
        ``embedding_function``), so it is created lazily rather than loading
        the model a second time in ``__init__``.
        """
        if self._encoder is None:
            self._encoder = SentenceTransformer(getattr(self, "model_name", 'all-MiniLM-L6-v2'))
        return self._encoder

    @encoder.setter
    def encoder(self, value):
        # Kept assignable so code that sets ``db.encoder`` continues to work.
        self._encoder = value

    def add_text(self, text, chunk_size):
        """
        Split ``text`` into chunks of ``chunk_size`` sentences and store them.

        Each chunk's id is derived from its content and written with
        ``upsert``, so adding the same text again (for example when the
        notebook is re-run against the persistent store) overwrites the
        existing entries instead of accumulating duplicates.

        Args:
            text: Raw text to index. Empty or whitespace-only text is ignored.
            chunk_size: Number of sentences per chunk; must be >= 1.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if not text or not text.strip():
            # Nothing to index; avoid sending an empty batch to Chroma.
            return

        sentences = sent_tokenize(text, language="english")
        chunks = self._create_chunks(sentences, chunk_size)

        # Map content-derived id -> chunk. The dict also drops chunks repeated
        # inside this text, which would otherwise produce duplicate ids in a
        # single request.
        chunks_by_id = {}
        for chunk in chunks:
            chunks_by_id.setdefault(str(uuid.uuid5(uuid.NAMESPACE_URL, chunk)), chunk)

        ids = list(chunks_by_id)
        documents = list(chunks_by_id.values())
        for start in range(0, len(ids), self._BATCH_SIZE):
            stop = start + self._BATCH_SIZE
            self.collection.upsert(
                documents=documents[start:stop],
                ids=ids[start:stop]
            )

    def _create_chunks(self, sentences, chunk_size):
        """
        Join consecutive sentences into space-separated chunks.

        Args:
            sentences: Sequence of sentence strings.
            chunk_size: Number of sentences per chunk; must be >= 1.

        Returns:
            List of chunk strings; the last chunk may hold fewer sentences.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
        return [
            ' '.join(sentences[start:start + chunk_size])
            for start in range(0, len(sentences), chunk_size)
        ]

    def retrieve(self, query, k=3):
        """
        Return up to ``k`` stored chunks most similar to ``query``.

        Args:
            query: Query text.
            k: Maximum number of chunks to return.

        Returns:
            List of document strings; empty when the collection is empty.
        """
        available = self.collection.count()
        if available == 0:
            return []
        # Never ask for more results than the collection holds.
        results = self.collection.query(query_texts=[query], n_results=min(k, available))
        return results['documents'][0]

In [5]:
class GroqGenerator:
    """
    A class to handle text generation using the Groq API.

    Builds a lesson prompt from a topic plus retrieved reference passages and
    sends it, together with a fixed system prompt, to a Groq chat model.
    """

    # System prompt sent with every request. The text (including its leading
    # whitespace) is part of what the model receives, so it is kept verbatim.
    SYSTEM_PROMPT = """
                You are an AI English teacher designed to create an elaborative and engaging lesson for students in an online setting. Your role is to generate a comprehensive lesson that helps the student fully understand the chosen topic. Since the lesson will only be created once, it must be detailed, covering all key aspects of the topic. For every topic provided, ensure the lesson includes:

                    Thorough Structure: Organize the lesson with clear sections, including an introduction, key concepts, examples, and a conclusion.
                    In-Depth Explanations: Provide detailed explanations of the topic, breaking down complex concepts into simple, easy-to-understand language, and offering relevant examples to clarify your points.
                    Self-Contained Learning: Ensure that the lesson is elaborative enough to allow the student to fully grasp the topic without needing further assistance.
                    Engaging and Interactive Elements: Although this is a single lesson, include creative learning activities such as vocabulary exercises, comprehension questions, and reflection prompts to enhance understanding.
                    Conversational Tone: Maintain a friendly, supportive, and encouraging tone, as if speaking directly to the student in a one-on-one setting.
                    Real-World Examples: Use relevant, real-world examples and scenarios to make the lesson more relatable and memorable.
                    Encouraging Practice: Conclude the lesson by encouraging the student to review the material and practice further on their own."""

    def __init__(self, model_name='mixtral-8x7b-32768', max_tokens=1000, temperature=0.7):
        """
        Create the Groq client and store the generation settings.

        Args:
            model_name: Identifier of the Groq chat model to call.
            max_tokens: Upper bound on generated tokens per lesson. Longer
                lessons are cut off at this limit, so raise it for fuller
                output.
            temperature: Sampling temperature passed to the API.
        """
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.client = Groq()

    def generate_lesson(self, topic, retrieved_content):
        """
        Generate a lesson on ``topic`` grounded in ``retrieved_content``.

        Args:
            topic: Subject of the lesson.
            retrieved_content: Iterable of reference passages (strings).
                ``None`` or empty entries are skipped.

        Returns:
            The text content of the first completion choice.
        """
        # Drop None/empty passages so joining cannot fail on a missing document.
        passages = [passage for passage in retrieved_content if passage]

        prompt = f"Create an engaging English lesson about {topic}. Use the following information:\n"
        prompt += "\n\n".join(passages)
        prompt += "\n\nLesson:"

        chat_completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": prompt}
            ],
            max_tokens=self.max_tokens,
            temperature=self.temperature
        )

        return chat_completion.choices[0].message.content

In [6]:
class RAGEnglishTeacher:
    """
    A class that combines the VectorDatabase and GroqGenerator to create an AI English teacher.

    Lessons are cached per instance (up to 32 topics) so asking for the same
    topic twice does not trigger a second retrieval and generation.
    """
    def __init__(self, vector_db, generator):
        """
        Args:
            vector_db: Object exposing ``retrieve(topic)`` and returning the
                reference passages for a topic.
            generator: Object exposing ``generate_lesson(topic, passages)``
                and returning the lesson text.
        """
        self.vector_db = vector_db
        self.generator = generator
        # The cache is created per instance. Decorating ``teach`` itself with
        # lru_cache would use one class-level cache keyed on ``self``: it keeps
        # every instance alive for as long as its entries stay cached and
        # shares the 32-entry budget between all instances.
        self._cached_teach = lru_cache(maxsize=32)(self._build_lesson)

    def teach(self, topic):
        """
        Return a lesson for ``topic``, generating it on first request.

        Args:
            topic: Lesson subject; must be hashable because it is the cache key.

        Returns:
            The lesson text produced by the generator.
        """
        return self._cached_teach(topic)

    def clear_cache(self):
        """
        Forget all cached lessons.

        Call this after adding content to the vector database so that later
        lessons are generated from the updated material.
        """
        self._cached_teach.cache_clear()

    def _build_lesson(self, topic):
        """Retrieve passages for ``topic`` and generate a lesson (uncached)."""
        relevant_content = self.vector_db.retrieve(topic)
        return self.generator.generate_lesson(topic, relevant_content)

In [7]:
# Helper functions

def extract_text_from_pdf(pdf_file_path):
    """
    Extract the text of every page of a PDF file.

    Pages that yield no text are skipped, and the remaining page texts are
    joined with a newline so the last word of one page is not fused with the
    first word of the next.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The extracted text, or None if the file could not be opened or parsed
        (the error is printed; this helper is deliberately best-effort).
    """
    try:
        with open(pdf_file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Treat a missing (None) or empty extraction result as "no text"
            # so one unreadable page cannot abort the whole document.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None

def save_lesson_as_pdf(topic, lesson, output_dir='lessons'):
    """
    Render a lesson to a PDF file and return the file's path.

    Args:
        topic: Lesson topic; used for the title line and the file name.
        lesson: Lesson body text.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        Path of the written file: ``<output_dir>/lesson_<topic>.pdf``, with
        spaces and path-hostile characters in the topic replaced by ``_``.
    """
    def latin1_safe(text):
        # The built-in "Arial" core font is limited to Latin-1. Generated text
        # often contains curly quotes or long dashes, so anything outside that
        # range is replaced with '?' instead of failing when the PDF is written.
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt=latin1_safe(f"Lesson on {topic}"), ln=1, align="C")
    pdf.multi_cell(0, 10, txt=latin1_safe(lesson))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Spaces become '_' as before; separators and other characters that are
    # invalid in file names are mapped the same way so the topic cannot point
    # outside output_dir or produce an unwritable name.
    unsafe_chars = set(' /\\:*?"<>|')
    safe_topic = ''.join('_' if ch in unsafe_chars else ch for ch in topic)

    filename = os.path.join(output_dir, f"lesson_{safe_topic}.pdf")
    pdf.output(filename)
    return filename

In [8]:
# Assemble the RAG English Teacher: a vector store for retrieval, a Groq-backed
# generator for writing, and the teacher object that ties the two together.
vector_db, generator = VectorDatabase(), GroqGenerator()
teacher = RAGEnglishTeacher(vector_db=vector_db, generator=generator)

In [9]:
# Adding content to the Vector database

def add_content_to_vector_db(content, chunk_size=5):
    """
    Index raw text in the notebook's global ``vector_db``.

    Args:
        content: Text to add. Empty, whitespace-only or None content is
            reported and skipped instead of being sent to the store.
        chunk_size: Number of sentences per stored chunk.
    """
    if not content or not content.strip():
        # Nothing to index; do not claim that content was added.
        print("No content to add to the vector database.")
        return

    vector_db.add_text(content, chunk_size)
    print("Content added to the vector database.")

def add_pdf_to_vector_db(pdf_path, chunk_size=5):
    """
    Extract the text of a PDF and index it in the vector database.

    Prints a failure message and does nothing else when no text could be
    extracted; otherwise hands the text to ``add_content_to_vector_db`` and
    prints a confirmation naming the file.

    Args:
        pdf_path: Path of the PDF file to ingest.
        chunk_size: Number of sentences per stored chunk.
    """
    extracted = extract_text_from_pdf(pdf_path)

    # Guard clause: None (read error) and "" (no text) are both failures.
    if not extracted:
        print(f"Failed to extract text from {pdf_path}")
        return

    add_content_to_vector_db(extracted, chunk_size)
    print(f"Content from {pdf_path} added to the vector database.")

In [10]:
# Generating and saving the lesson

def generate_lesson(topic):
    """
    Ask the global ``teacher`` for a lesson on ``topic``, print it, and return it.

    Args:
        topic: Subject of the lesson.

    Returns:
        The lesson as returned by ``teacher.teach``.
    """
    generated = teacher.teach(topic)
    # Header (which itself ends in a newline), a separator newline, then the
    # lesson body: a heading, a blank line, and the lesson.
    print(f"Lesson on {topic}:\n", generated, sep="\n")
    return generated

def save_lesson(topic, lesson):
    """
    Save a lesson as a PDF and print where it was written.

    Args:
        topic: Lesson topic, forwarded to ``save_lesson_as_pdf``.
        lesson: Lesson text; when empty or None nothing is written and a
            notice is printed instead.
    """
    if not lesson:
        print("No lesson to save.")
        return

    saved_path = save_lesson_as_pdf(topic, lesson)
    print(f"Lesson saved as PDF: {saved_path}")

In [11]:
# Ingest the grammar textbook PDF into the vector database, 10 sentences per chunk
add_pdf_to_vector_db(pdf_path='./A-Students-Introduction-to-English-Grammar (1).pdf', chunk_size=10)

unknown widths : 
[0, IndirectObject(6603, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6812, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6700, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(7339, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(7191, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6603, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(9005, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6643, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(7239, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6588, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6633, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6712, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6588, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(7191, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6643, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6603, 0, 6005352752)]
unknown widths : 
[0, IndirectObject(6613, 0, 6005352752

Content added to the vector database.
Content from ./A-Students-Introduction-to-English-Grammar (1).pdf added to the vector database.


In [12]:

# Generate (and print) a lesson on prepositions; keep the text for saving below
lesson = generate_lesson(topic="Prepositions")


Lesson on Prepositions:

**Lesson Title:** Understanding Prepositions: Breaking Down the Building Blocks of English Sentences

**Introduction**
Hello and welcome to our comprehensive lesson on prepositions! Prepositions are crucial words in English that help connect other words and phrases together, creating meaningful sentences. They are often small words, but they carry great importance in expressing relationships between different elements in a sentence.

**Key Concepts**
1. Prepositions are words that show relationships between other words in a sentence, such as location, time, direction, and manner.
2. Traditional grammars list around 100 prepositions, but our treatment includes more, as we recognize words formerly classified as adverbs or subordinating conjunctions as prepositions.
3. Prepositions typically take Noun Phrases (NPs) as their complements. However, we will see that this is not always the case.

**In-Depth Explanations**

*Prepositions and Noun Phrases*
Prepositions a

In [13]:
# Write the generated lesson to a PDF file
save_lesson(topic="Prepositions", lesson=lesson)

Lesson saved as PDF: lessons/lesson_Prepositions.pdf
