```
Clone repository
git clone https://github.com/mage-ai/rag-project
cd rag-project
Navigate to the rag-project/llm directory and add spacy to requirements.txt.
Then add the following line to the Dockerfile found in the rag-project directory:
RUN python -m spacy download en_core_web_sm
Run
./scripts/start.sh
Once started, go to http://localhost:6789/
```

In [3]:
%pip install python-docx -qq

In [1]:
import io

import requests

import docx

In [2]:
def clean_line(line):
    """Return ``line`` without surrounding whitespace or byte-order marks.

    Whitespace and U+FEFF (BOM / zero-width no-break space) may be
    interleaved at either end of the text (e.g. ``'\\uFEFF  Title'``), so
    both are stripped repeatedly until the string stops changing. A single
    ``strip()`` followed by ``strip('\\uFEFF')`` would leave the whitespace
    that was hidden behind the BOM.

    Args:
        line: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing but whitespace/BOM
        characters were present.
    """
    previous = None
    while line != previous:
        previous = line
        line = line.strip().strip('\uFEFF')
    return line

def read_faq(file_id, timeout=30):
    """Download a Google Docs FAQ and split it into question/answer records.

    The document is exported as .docx and walked paragraph by paragraph:
    paragraphs styled 'heading 1' start a new section, paragraphs styled
    'heading 2' start a new question, and every other non-empty paragraph is
    appended to the answer of the current question.

    Args:
        file_id: Google Docs document id used to build the export URL.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list of dicts with the keys 'text', 'section' and 'question', in
        document order. Questions without any answer text, and text that is
        not preceded by both a section and a question heading, are skipped.

    Raises:
        requests.HTTPError: If the export request returns an error status.
    """
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'

    questions = []
    section_title = ''
    question_title = ''
    answer_lines = []

    def flush_pending():
        # Emit the question collected so far (if it is complete) and always
        # drop the buffered text, so it can never leak into a later question.
        answer_text = '\n'.join(answer_lines).strip()
        if answer_text != '' and section_title != '' and question_title != '':
            questions.append({
                'text': answer_text,
                'section': section_title,
                'question': question_title,
            })
        answer_lines.clear()

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        if len(p_text) == 0:
            continue

        if style == section_heading_style:
            # Flush BEFORE switching sections: the pending question belongs
            # to the section that is ending, not to the one that starts here.
            flush_pending()
            section_title = p_text
            # Text between a section heading and its first question has no
            # question to attach to.
            question_title = ''
            continue

        if style == question_heading_style:
            flush_pending()
            question_title = p_text
            continue

        answer_lines.append(p_text)

    # The last question in the document has no following heading to flush it.
    flush_pending()

    return questions
    

In [3]:
# Course name -> Google Docs file id of that course's FAQ document.
faq_documents = {
    'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',
}

# One entry per course: {'course': <name>, 'documents': <parsed FAQ records>}.
documents = []

for course in faq_documents:
    print(course)
    file_id = faq_documents[course]
    course_documents = read_faq(file_id)
    documents += [dict(course=course, documents=course_documents)]

llm-zoomcamp


In [4]:
import json
# Persist the parsed FAQ records as pretty-printed (2-space indented) JSON.
with open('llm_faq_documents_version_1.json', mode='wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [8]:
!head documents.json

[
  {
    "course": "llm-zoomcamp",
    "documents": [
      {
        "text": "Yes, but if you want to receive a certificate, you need to submit your project while we\u2019re still accepting submissions.",
        "section": "General course-related questions",
        "question": "I just discovered the course. Can I still join?"
      },
      {
